xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 89583661dbdda284bc23265230c2f308532cda40)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
9 {
10   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
11   PetscErrorCode ierr;
12   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
13 
14   PetscFunctionBegin;
15   n      = A->cmap->n;
16   m      = A->rmap->n;
17   ns_row = a->inode.size;
18 
19   min_mn = (m < n) ? m : n;
20   if (!ns) {
21     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
22     for (; count+1 < n; count++,i++) ;
23     if (count < n)  {
24       i++;
25     }
26     *size = i;
27     PetscFunctionReturn(0);
28   }
29   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
30 
31   /* Use the same row structure wherever feasible. */
32   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
33     ns_col[i] = ns_row[i];
34   }
35 
36   /* if m < n; pad up the remainder with inode_limit */
37   for (; count+1 < n; count++,i++) {
38     ns_col[i] = 1;
39   }
40   /* The last node is the odd ball. padd it up with the remaining rows; */
41   if (count < n) {
42     ns_col[i] = n - count;
43     i++;
44   } else if (count > n) {
45     /* Adjust for the over estimation */
46     ns_col[i-1] += n - count;
47   }
48   *size = i;
49   *ns   = ns_col;
50   PetscFunctionReturn(0);
51 }
52 
53 
54 /*
55       This builds symmetric version of nonzero structure,
56 */
57 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
58 {
59   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
60   PetscErrorCode ierr;
61   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
62   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
63   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
64 
65   PetscFunctionBegin;
66   nslim_row = a->inode.node_count;
67   m         = A->rmap->n;
68   n         = A->cmap->n;
69   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
70 
71   /* Use the row_inode as column_inode */
72   nslim_col = nslim_row;
73   ns_col    = ns_row;
74 
75   /* allocate space for reformated inode structure */
76   ierr = PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);CHKERRQ(ierr);
77   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
78 
79   for (i1=0,col=0; i1<nslim_col; ++i1) {
80     nsz = ns_col[i1];
81     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
82   }
83   /* allocate space for row pointers */
84   ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
85   *iia = ia;
86   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
87 
88   /* determine the number of columns in each row */
89   ia[0] = oshift;
90   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
91 
92     j    = aj + ai[row] + ishift;
93     jmax = aj + ai[row+1] + ishift;
94     if (j==jmax) continue; /* empty row */
95     col  = *j++ + ishift;
96     i2   = tvc[col];
97     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
98       ia[i1+1]++;
99       ia[i2+1]++;
100       i2++;                     /* Start col of next node */
101       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
102       i2 = tvc[col];
103     }
104     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
105   }
106 
107   /* shift ia[i] to point to next row */
108   for (i1=1; i1<nslim_row+1; i1++) {
109     row        = ia[i1-1];
110     ia[i1]    += row;
111     work[i1-1] = row - oshift;
112   }
113 
114   /* allocate space for column pointers */
115   nz   = ia[nslim_row] + (!ishift);
116   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
117   *jja = ja;
118 
119   /* loop over lower triangular part putting into ja */
120   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
121     j    = aj + ai[row] + ishift;
122     jmax = aj + ai[row+1] + ishift;
123     if (j==jmax) continue; /* empty row */
124     col  = *j++ + ishift;
125     i2   = tvc[col];
126     while (i2<i1 && j<jmax) {
127       ja[work[i2]++] = i1 + oshift;
128       ja[work[i1]++] = i2 + oshift;
129       ++i2;
130       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
131       i2 = tvc[col];
132     }
133     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
134 
135   }
136   ierr = PetscFree(work);CHKERRQ(ierr);
137   ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
138   PetscFunctionReturn(0);
139 }
140 
141 /*
142       This builds nonsymmetric version of nonzero structure,
143 */
144 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
145 {
146   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
147   PetscErrorCode ierr;
148   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
149   PetscInt       *tns,*tvc,nsz,i1,i2;
150   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
151 
152   PetscFunctionBegin;
153   nslim_row = a->inode.node_count;
154   n         = A->cmap->n;
155 
156   /* Create The column_inode for this matrix */
157   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
158 
159   /* allocate space for reformated column_inode structure */
160   ierr = PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);CHKERRQ(ierr);
161   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
162 
163   for (i1=0,col=0; i1<nslim_col; ++i1) {
164     nsz = ns_col[i1];
165     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
166   }
167   /* allocate space for row pointers */
168   ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
169   *iia = ia;
170   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
171 
172   /* determine the number of columns in each row */
173   ia[0] = oshift;
174   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
175     j   = aj + ai[row] + ishift;
176     nz  = ai[row+1] - ai[row];
177     if (!nz) continue; /* empty row */
178     col = *j++ + ishift;
179     i2  = tvc[col];
180     while (nz-- > 0) {           /* off-diagonal elemets */
181       ia[i1+1]++;
182       i2++;                     /* Start col of next node */
183       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
184       if (nz > 0) i2 = tvc[col];
185     }
186   }
187 
188   /* shift ia[i] to point to next row */
189   for (i1=1; i1<nslim_row+1; i1++) {
190     row        = ia[i1-1];
191     ia[i1]    += row;
192     work[i1-1] = row - oshift;
193   }
194 
195   /* allocate space for column pointers */
196   nz   = ia[nslim_row] + (!ishift);
197   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
198   *jja = ja;
199 
200   /* loop over matrix putting into ja */
201   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
202     j   = aj + ai[row] + ishift;
203     nz  = ai[row+1] - ai[row];
204     if (!nz) continue; /* empty row */
205     col = *j++ + ishift;
206     i2  = tvc[col];
207     while (nz-- > 0) {
208       ja[work[i1]++] = i2 + oshift;
209       ++i2;
210       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
211       if (nz > 0) i2 = tvc[col];
212     }
213   }
214   ierr = PetscFree(ns_col);CHKERRQ(ierr);
215   ierr = PetscFree(work);CHKERRQ(ierr);
216   ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
217   PetscFunctionReturn(0);
218 }
219 
220 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
221 {
222   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
223   PetscErrorCode ierr;
224 
225   PetscFunctionBegin;
226   *n = a->inode.node_count;
227   if (!ia) PetscFunctionReturn(0);
228   if (!blockcompressed) {
229     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
230   } else if (symmetric) {
231     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
232   } else {
233     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
234   }
235   PetscFunctionReturn(0);
236 }
237 
238 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
239 {
240   PetscErrorCode ierr;
241 
242   PetscFunctionBegin;
243   if (!ia) PetscFunctionReturn(0);
244 
245   if (!blockcompressed) {
246     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
247   } else {
248     ierr = PetscFree(*ia);CHKERRQ(ierr);
249     ierr = PetscFree(*ja);CHKERRQ(ierr);
250   }
251   PetscFunctionReturn(0);
252 }
253 
254 /* ----------------------------------------------------------- */
255 
256 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
257 {
258   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
259   PetscErrorCode ierr;
260   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
261   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
262 
263   PetscFunctionBegin;
264   nslim_row = a->inode.node_count;
265   n         = A->cmap->n;
266 
267   /* Create The column_inode for this matrix */
268   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
269 
270   /* allocate space for reformated column_inode structure */
271   ierr = PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);CHKERRQ(ierr);
272   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
273 
274   for (i1=0,col=0; i1<nslim_col; ++i1) {
275     nsz = ns_col[i1];
276     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
277   }
278   /* allocate space for column pointers */
279   ierr = PetscCalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
280   *iia = ia;
281   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
282 
283   /* determine the number of columns in each row */
284   ia[0] = oshift;
285   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
286     j   = aj + ai[row] + ishift;
287     col = *j++ + ishift;
288     i2  = tvc[col];
289     nz  = ai[row+1] - ai[row];
290     while (nz-- > 0) {           /* off-diagonal elemets */
291       /* ia[i1+1]++; */
292       ia[i2+1]++;
293       i2++;
294       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
295       if (nz > 0) i2 = tvc[col];
296     }
297   }
298 
299   /* shift ia[i] to point to next col */
300   for (i1=1; i1<nslim_col+1; i1++) {
301     col        = ia[i1-1];
302     ia[i1]    += col;
303     work[i1-1] = col - oshift;
304   }
305 
306   /* allocate space for column pointers */
307   nz   = ia[nslim_col] + (!ishift);
308   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
309   *jja = ja;
310 
311   /* loop over matrix putting into ja */
312   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
313     j   = aj + ai[row] + ishift;
314     col = *j++ + ishift;
315     i2  = tvc[col];
316     nz  = ai[row+1] - ai[row];
317     while (nz-- > 0) {
318       /* ja[work[i1]++] = i2 + oshift; */
319       ja[work[i2]++] = i1 + oshift;
320       i2++;
321       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
322       if (nz > 0) i2 = tvc[col];
323     }
324   }
325   ierr = PetscFree(ns_col);CHKERRQ(ierr);
326   ierr = PetscFree(work);CHKERRQ(ierr);
327   ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
328   PetscFunctionReturn(0);
329 }
330 
331 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
332 {
333   PetscErrorCode ierr;
334 
335   PetscFunctionBegin;
336   ierr = MatCreateColInode_Private(A,n,NULL);CHKERRQ(ierr);
337   if (!ia) PetscFunctionReturn(0);
338 
339   if (!blockcompressed) {
340     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
341   } else if (symmetric) {
342     /* Since the indices are symmetric it does'nt matter */
343     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
344   } else {
345     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
346   }
347   PetscFunctionReturn(0);
348 }
349 
350 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351 {
352   PetscErrorCode ierr;
353 
354   PetscFunctionBegin;
355   if (!ia) PetscFunctionReturn(0);
356   if (!blockcompressed) {
357     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
358   } else {
359     ierr = PetscFree(*ia);CHKERRQ(ierr);
360     ierr = PetscFree(*ja);CHKERRQ(ierr);
361   }
362   PetscFunctionReturn(0);
363 }
364 
365 /* ----------------------------------------------------------- */
366 
367 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
368 {
369   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
370   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
371   PetscScalar       *y;
372   const PetscScalar *x;
373   const MatScalar   *v1,*v2,*v3,*v4,*v5;
374   PetscErrorCode    ierr;
375   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
376   const PetscInt    *idx,*ns,*ii;
377 
378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
380 #endif
381 
382   PetscFunctionBegin;
383   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
384   node_max = a->inode.node_count;
385   ns       = a->inode.size;     /* Node Size array */
386   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
387   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
388   idx      = a->j;
389   v1       = a->a;
390   ii       = a->i;
391 
392   for (i = 0,row = 0; i< node_max; ++i) {
393     nsz         = ns[i];
394     n           = ii[1] - ii[0];
395     nonzerorow += (n>0)*nsz;
396     ii         += nsz;
397     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
398     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
399     sz = n;                     /* No of non zeros in this row */
400                                 /* Switch on the size of Node */
401     switch (nsz) {               /* Each loop in 'case' is unrolled */
402     case 1:
403       sum1 = 0.;
404 
405       for (n = 0; n< sz-1; n+=2) {
406         i1    = idx[0];         /* The instructions are ordered to */
407         i2    = idx[1];         /* make the compiler's job easy */
408         idx  += 2;
409         tmp0  = x[i1];
410         tmp1  = x[i2];
411         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
412       }
413 
414       if (n == sz-1) {          /* Take care of the last nonzero  */
415         tmp0  = x[*idx++];
416         sum1 += *v1++ *tmp0;
417       }
418       y[row++]=sum1;
419       break;
420     case 2:
421       sum1 = 0.;
422       sum2 = 0.;
423       v2   = v1 + n;
424 
425       for (n = 0; n< sz-1; n+=2) {
426         i1    = idx[0];
427         i2    = idx[1];
428         idx  += 2;
429         tmp0  = x[i1];
430         tmp1  = x[i2];
431         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
432         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
433       }
434       if (n == sz-1) {
435         tmp0  = x[*idx++];
436         sum1 += *v1++ * tmp0;
437         sum2 += *v2++ * tmp0;
438       }
439       y[row++]=sum1;
440       y[row++]=sum2;
441       v1      =v2;              /* Since the next block to be processed starts there*/
442       idx    +=sz;
443       break;
444     case 3:
445       sum1 = 0.;
446       sum2 = 0.;
447       sum3 = 0.;
448       v2   = v1 + n;
449       v3   = v2 + n;
450 
451       for (n = 0; n< sz-1; n+=2) {
452         i1    = idx[0];
453         i2    = idx[1];
454         idx  += 2;
455         tmp0  = x[i1];
456         tmp1  = x[i2];
457         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
458         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
459         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
460       }
461       if (n == sz-1) {
462         tmp0  = x[*idx++];
463         sum1 += *v1++ * tmp0;
464         sum2 += *v2++ * tmp0;
465         sum3 += *v3++ * tmp0;
466       }
467       y[row++]=sum1;
468       y[row++]=sum2;
469       y[row++]=sum3;
470       v1      =v3;              /* Since the next block to be processed starts there*/
471       idx    +=2*sz;
472       break;
473     case 4:
474       sum1 = 0.;
475       sum2 = 0.;
476       sum3 = 0.;
477       sum4 = 0.;
478       v2   = v1 + n;
479       v3   = v2 + n;
480       v4   = v3 + n;
481 
482       for (n = 0; n< sz-1; n+=2) {
483         i1    = idx[0];
484         i2    = idx[1];
485         idx  += 2;
486         tmp0  = x[i1];
487         tmp1  = x[i2];
488         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
489         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
490         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
491         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
492       }
493       if (n == sz-1) {
494         tmp0  = x[*idx++];
495         sum1 += *v1++ * tmp0;
496         sum2 += *v2++ * tmp0;
497         sum3 += *v3++ * tmp0;
498         sum4 += *v4++ * tmp0;
499       }
500       y[row++]=sum1;
501       y[row++]=sum2;
502       y[row++]=sum3;
503       y[row++]=sum4;
504       v1      =v4;              /* Since the next block to be processed starts there*/
505       idx    +=3*sz;
506       break;
507     case 5:
508       sum1 = 0.;
509       sum2 = 0.;
510       sum3 = 0.;
511       sum4 = 0.;
512       sum5 = 0.;
513       v2   = v1 + n;
514       v3   = v2 + n;
515       v4   = v3 + n;
516       v5   = v4 + n;
517 
518       for (n = 0; n<sz-1; n+=2) {
519         i1    = idx[0];
520         i2    = idx[1];
521         idx  += 2;
522         tmp0  = x[i1];
523         tmp1  = x[i2];
524         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
525         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
526         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
527         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
528         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
529       }
530       if (n == sz-1) {
531         tmp0  = x[*idx++];
532         sum1 += *v1++ * tmp0;
533         sum2 += *v2++ * tmp0;
534         sum3 += *v3++ * tmp0;
535         sum4 += *v4++ * tmp0;
536         sum5 += *v5++ * tmp0;
537       }
538       y[row++]=sum1;
539       y[row++]=sum2;
540       y[row++]=sum3;
541       y[row++]=sum4;
542       y[row++]=sum5;
543       v1      =v5;       /* Since the next block to be processed starts there */
544       idx    +=4*sz;
545       break;
546     default:
547       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
548     }
549   }
550   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
551   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
552   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
553   PetscFunctionReturn(0);
554 }
555 /* ----------------------------------------------------------- */
556 /* Almost same code as the MatMult_SeqAIJ_Inode() */
557 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
558 {
559   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
560   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
561   const MatScalar   *v1,*v2,*v3,*v4,*v5;
562   const PetscScalar *x;
563   PetscScalar       *y,*z,*zt;
564   PetscErrorCode    ierr;
565   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
566   const PetscInt    *idx,*ns,*ii;
567 
568   PetscFunctionBegin;
569   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
570   node_max = a->inode.node_count;
571   ns       = a->inode.size;     /* Node Size array */
572 
573   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
574   ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
575   zt = z;
576 
577   idx = a->j;
578   v1  = a->a;
579   ii  = a->i;
580 
581   for (i = 0,row = 0; i< node_max; ++i) {
582     nsz = ns[i];
583     n   = ii[1] - ii[0];
584     ii += nsz;
585     sz  = n;                    /* No of non zeros in this row */
586                                 /* Switch on the size of Node */
587     switch (nsz) {               /* Each loop in 'case' is unrolled */
588     case 1:
589       sum1 = *zt++;
590 
591       for (n = 0; n< sz-1; n+=2) {
592         i1    = idx[0];         /* The instructions are ordered to */
593         i2    = idx[1];         /* make the compiler's job easy */
594         idx  += 2;
595         tmp0  = x[i1];
596         tmp1  = x[i2];
597         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
598       }
599 
600       if (n   == sz-1) {          /* Take care of the last nonzero  */
601         tmp0  = x[*idx++];
602         sum1 += *v1++ * tmp0;
603       }
604       y[row++]=sum1;
605       break;
606     case 2:
607       sum1 = *zt++;
608       sum2 = *zt++;
609       v2   = v1 + n;
610 
611       for (n = 0; n< sz-1; n+=2) {
612         i1    = idx[0];
613         i2    = idx[1];
614         idx  += 2;
615         tmp0  = x[i1];
616         tmp1  = x[i2];
617         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
618         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
619       }
620       if (n   == sz-1) {
621         tmp0  = x[*idx++];
622         sum1 += *v1++ * tmp0;
623         sum2 += *v2++ * tmp0;
624       }
625       y[row++]=sum1;
626       y[row++]=sum2;
627       v1      =v2;              /* Since the next block to be processed starts there*/
628       idx    +=sz;
629       break;
630     case 3:
631       sum1 = *zt++;
632       sum2 = *zt++;
633       sum3 = *zt++;
634       v2   = v1 + n;
635       v3   = v2 + n;
636 
637       for (n = 0; n< sz-1; n+=2) {
638         i1    = idx[0];
639         i2    = idx[1];
640         idx  += 2;
641         tmp0  = x[i1];
642         tmp1  = x[i2];
643         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
644         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
645         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
646       }
647       if (n == sz-1) {
648         tmp0  = x[*idx++];
649         sum1 += *v1++ * tmp0;
650         sum2 += *v2++ * tmp0;
651         sum3 += *v3++ * tmp0;
652       }
653       y[row++]=sum1;
654       y[row++]=sum2;
655       y[row++]=sum3;
656       v1      =v3;              /* Since the next block to be processed starts there*/
657       idx    +=2*sz;
658       break;
659     case 4:
660       sum1 = *zt++;
661       sum2 = *zt++;
662       sum3 = *zt++;
663       sum4 = *zt++;
664       v2   = v1 + n;
665       v3   = v2 + n;
666       v4   = v3 + n;
667 
668       for (n = 0; n< sz-1; n+=2) {
669         i1    = idx[0];
670         i2    = idx[1];
671         idx  += 2;
672         tmp0  = x[i1];
673         tmp1  = x[i2];
674         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
675         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
676         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
677         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
678       }
679       if (n == sz-1) {
680         tmp0  = x[*idx++];
681         sum1 += *v1++ * tmp0;
682         sum2 += *v2++ * tmp0;
683         sum3 += *v3++ * tmp0;
684         sum4 += *v4++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       y[row++]=sum4;
690       v1      =v4;              /* Since the next block to be processed starts there*/
691       idx    +=3*sz;
692       break;
693     case 5:
694       sum1 = *zt++;
695       sum2 = *zt++;
696       sum3 = *zt++;
697       sum4 = *zt++;
698       sum5 = *zt++;
699       v2   = v1 + n;
700       v3   = v2 + n;
701       v4   = v3 + n;
702       v5   = v4 + n;
703 
704       for (n = 0; n<sz-1; n+=2) {
705         i1    = idx[0];
706         i2    = idx[1];
707         idx  += 2;
708         tmp0  = x[i1];
709         tmp1  = x[i2];
710         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
711         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
712         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
713         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
714         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
715       }
716       if (n   == sz-1) {
717         tmp0  = x[*idx++];
718         sum1 += *v1++ * tmp0;
719         sum2 += *v2++ * tmp0;
720         sum3 += *v3++ * tmp0;
721         sum4 += *v4++ * tmp0;
722         sum5 += *v5++ * tmp0;
723       }
724       y[row++]=sum1;
725       y[row++]=sum2;
726       y[row++]=sum3;
727       y[row++]=sum4;
728       y[row++]=sum5;
729       v1      =v5;       /* Since the next block to be processed starts there */
730       idx    +=4*sz;
731       break;
732     default:
733       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
734     }
735   }
736   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
737   ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
738   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
739   PetscFunctionReturn(0);
740 }
741 
742 /* ----------------------------------------------------------- */
743 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
744 {
745   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
746   IS                iscol = a->col,isrow = a->row;
747   PetscErrorCode    ierr;
748   const PetscInt    *r,*c,*rout,*cout;
749   PetscInt          i,j,n = A->rmap->n,nz;
750   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
751   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
752   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
753   PetscScalar       sum1,sum2,sum3,sum4,sum5;
754   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
755   const PetscScalar *b;
756 
757   PetscFunctionBegin;
758   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
759   node_max = a->inode.node_count;
760   ns       = a->inode.size;     /* Node Size array */
761 
762   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
763   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
764   tmp  = a->solve_work;
765 
766   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
767   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
768 
769   /* forward solve the lower triangular */
770   tmps = tmp;
771   aa   = a_a;
772   aj   = a_j;
773   ad   = a->diag;
774 
775   for (i = 0,row = 0; i< node_max; ++i) {
776     nsz = ns[i];
777     aii = ai[row];
778     v1  = aa + aii;
779     vi  = aj + aii;
780     nz  = ad[row]- aii;
781     if (i < node_max-1) {
782       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
783       * but our indexing to determine it's size could. */
784       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
785       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
786       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
787       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
788     }
789 
790     switch (nsz) {               /* Each loop in 'case' is unrolled */
791     case 1:
792       sum1 = b[*r++];
793       for (j=0; j<nz-1; j+=2) {
794         i0    = vi[0];
795         i1    = vi[1];
796         vi   +=2;
797         tmp0  = tmps[i0];
798         tmp1  = tmps[i1];
799         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
800       }
801       if (j == nz-1) {
802         tmp0  = tmps[*vi++];
803         sum1 -= *v1++ *tmp0;
804       }
805       tmp[row++]=sum1;
806       break;
807     case 2:
808       sum1 = b[*r++];
809       sum2 = b[*r++];
810       v2   = aa + ai[row+1];
811 
812       for (j=0; j<nz-1; j+=2) {
813         i0    = vi[0];
814         i1    = vi[1];
815         vi   +=2;
816         tmp0  = tmps[i0];
817         tmp1  = tmps[i1];
818         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
819         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
820       }
821       if (j == nz-1) {
822         tmp0  = tmps[*vi++];
823         sum1 -= *v1++ *tmp0;
824         sum2 -= *v2++ *tmp0;
825       }
826       sum2     -= *v2++ *sum1;
827       tmp[row++]=sum1;
828       tmp[row++]=sum2;
829       break;
830     case 3:
831       sum1 = b[*r++];
832       sum2 = b[*r++];
833       sum3 = b[*r++];
834       v2   = aa + ai[row+1];
835       v3   = aa + ai[row+2];
836 
837       for (j=0; j<nz-1; j+=2) {
838         i0    = vi[0];
839         i1    = vi[1];
840         vi   +=2;
841         tmp0  = tmps[i0];
842         tmp1  = tmps[i1];
843         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
844         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
845         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
846       }
847       if (j == nz-1) {
848         tmp0  = tmps[*vi++];
849         sum1 -= *v1++ *tmp0;
850         sum2 -= *v2++ *tmp0;
851         sum3 -= *v3++ *tmp0;
852       }
853       sum2 -= *v2++ * sum1;
854       sum3 -= *v3++ * sum1;
855       sum3 -= *v3++ * sum2;
856 
857       tmp[row++]=sum1;
858       tmp[row++]=sum2;
859       tmp[row++]=sum3;
860       break;
861 
862     case 4:
863       sum1 = b[*r++];
864       sum2 = b[*r++];
865       sum3 = b[*r++];
866       sum4 = b[*r++];
867       v2   = aa + ai[row+1];
868       v3   = aa + ai[row+2];
869       v4   = aa + ai[row+3];
870 
871       for (j=0; j<nz-1; j+=2) {
872         i0    = vi[0];
873         i1    = vi[1];
874         vi   +=2;
875         tmp0  = tmps[i0];
876         tmp1  = tmps[i1];
877         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
878         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
879         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
880         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
881       }
882       if (j == nz-1) {
883         tmp0  = tmps[*vi++];
884         sum1 -= *v1++ *tmp0;
885         sum2 -= *v2++ *tmp0;
886         sum3 -= *v3++ *tmp0;
887         sum4 -= *v4++ *tmp0;
888       }
889       sum2 -= *v2++ * sum1;
890       sum3 -= *v3++ * sum1;
891       sum4 -= *v4++ * sum1;
892       sum3 -= *v3++ * sum2;
893       sum4 -= *v4++ * sum2;
894       sum4 -= *v4++ * sum3;
895 
896       tmp[row++]=sum1;
897       tmp[row++]=sum2;
898       tmp[row++]=sum3;
899       tmp[row++]=sum4;
900       break;
901     case 5:
902       sum1 = b[*r++];
903       sum2 = b[*r++];
904       sum3 = b[*r++];
905       sum4 = b[*r++];
906       sum5 = b[*r++];
907       v2   = aa + ai[row+1];
908       v3   = aa + ai[row+2];
909       v4   = aa + ai[row+3];
910       v5   = aa + ai[row+4];
911 
912       for (j=0; j<nz-1; j+=2) {
913         i0    = vi[0];
914         i1    = vi[1];
915         vi   +=2;
916         tmp0  = tmps[i0];
917         tmp1  = tmps[i1];
918         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
919         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
920         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
921         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
922         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
923       }
924       if (j == nz-1) {
925         tmp0  = tmps[*vi++];
926         sum1 -= *v1++ *tmp0;
927         sum2 -= *v2++ *tmp0;
928         sum3 -= *v3++ *tmp0;
929         sum4 -= *v4++ *tmp0;
930         sum5 -= *v5++ *tmp0;
931       }
932 
933       sum2 -= *v2++ * sum1;
934       sum3 -= *v3++ * sum1;
935       sum4 -= *v4++ * sum1;
936       sum5 -= *v5++ * sum1;
937       sum3 -= *v3++ * sum2;
938       sum4 -= *v4++ * sum2;
939       sum5 -= *v5++ * sum2;
940       sum4 -= *v4++ * sum3;
941       sum5 -= *v5++ * sum3;
942       sum5 -= *v5++ * sum4;
943 
944       tmp[row++]=sum1;
945       tmp[row++]=sum2;
946       tmp[row++]=sum3;
947       tmp[row++]=sum4;
948       tmp[row++]=sum5;
949       break;
950     default:
951       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
952     }
953   }
954   /* backward solve the upper triangular */
955   for (i=node_max -1,row = n-1; i>=0; i--) {
956     nsz = ns[i];
957     aii = ai[row+1] -1;
958     v1  = aa + aii;
959     vi  = aj + aii;
960     nz  = aii- ad[row];
961     switch (nsz) {               /* Each loop in 'case' is unrolled */
962     case 1:
963       sum1 = tmp[row];
964 
965       for (j=nz; j>1; j-=2) {
966         vi   -=2;
967         i0    = vi[2];
968         i1    = vi[1];
969         tmp0  = tmps[i0];
970         tmp1  = tmps[i1];
971         v1   -= 2;
972         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
973       }
974       if (j==1) {
975         tmp0  = tmps[*vi--];
976         sum1 -= *v1-- * tmp0;
977       }
978       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
979       break;
980     case 2:
981       sum1 = tmp[row];
982       sum2 = tmp[row -1];
983       v2   = aa + ai[row]-1;
984       for (j=nz; j>1; j-=2) {
985         vi   -=2;
986         i0    = vi[2];
987         i1    = vi[1];
988         tmp0  = tmps[i0];
989         tmp1  = tmps[i1];
990         v1   -= 2;
991         v2   -= 2;
992         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
993         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
994       }
995       if (j==1) {
996         tmp0  = tmps[*vi--];
997         sum1 -= *v1-- * tmp0;
998         sum2 -= *v2-- * tmp0;
999       }
1000 
1001       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1002       sum2   -= *v2-- * tmp0;
1003       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1004       break;
1005     case 3:
1006       sum1 = tmp[row];
1007       sum2 = tmp[row -1];
1008       sum3 = tmp[row -2];
1009       v2   = aa + ai[row]-1;
1010       v3   = aa + ai[row -1]-1;
1011       for (j=nz; j>1; j-=2) {
1012         vi   -=2;
1013         i0    = vi[2];
1014         i1    = vi[1];
1015         tmp0  = tmps[i0];
1016         tmp1  = tmps[i1];
1017         v1   -= 2;
1018         v2   -= 2;
1019         v3   -= 2;
1020         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1021         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1022         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1023       }
1024       if (j==1) {
1025         tmp0  = tmps[*vi--];
1026         sum1 -= *v1-- * tmp0;
1027         sum2 -= *v2-- * tmp0;
1028         sum3 -= *v3-- * tmp0;
1029       }
1030       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1031       sum2   -= *v2-- * tmp0;
1032       sum3   -= *v3-- * tmp0;
1033       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1034       sum3   -= *v3-- * tmp0;
1035       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1036 
1037       break;
1038     case 4:
1039       sum1 = tmp[row];
1040       sum2 = tmp[row -1];
1041       sum3 = tmp[row -2];
1042       sum4 = tmp[row -3];
1043       v2   = aa + ai[row]-1;
1044       v3   = aa + ai[row -1]-1;
1045       v4   = aa + ai[row -2]-1;
1046 
1047       for (j=nz; j>1; j-=2) {
1048         vi   -=2;
1049         i0    = vi[2];
1050         i1    = vi[1];
1051         tmp0  = tmps[i0];
1052         tmp1  = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         v4   -= 2;
1057         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1058         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1059         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1060         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1061       }
1062       if (j==1) {
1063         tmp0  = tmps[*vi--];
1064         sum1 -= *v1-- * tmp0;
1065         sum2 -= *v2-- * tmp0;
1066         sum3 -= *v3-- * tmp0;
1067         sum4 -= *v4-- * tmp0;
1068       }
1069 
1070       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1071       sum2   -= *v2-- * tmp0;
1072       sum3   -= *v3-- * tmp0;
1073       sum4   -= *v4-- * tmp0;
1074       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1075       sum3   -= *v3-- * tmp0;
1076       sum4   -= *v4-- * tmp0;
1077       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1078       sum4   -= *v4-- * tmp0;
1079       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1080       break;
1081     case 5:
1082       sum1 = tmp[row];
1083       sum2 = tmp[row -1];
1084       sum3 = tmp[row -2];
1085       sum4 = tmp[row -3];
1086       sum5 = tmp[row -4];
1087       v2   = aa + ai[row]-1;
1088       v3   = aa + ai[row -1]-1;
1089       v4   = aa + ai[row -2]-1;
1090       v5   = aa + ai[row -3]-1;
1091       for (j=nz; j>1; j-=2) {
1092         vi   -= 2;
1093         i0    = vi[2];
1094         i1    = vi[1];
1095         tmp0  = tmps[i0];
1096         tmp1  = tmps[i1];
1097         v1   -= 2;
1098         v2   -= 2;
1099         v3   -= 2;
1100         v4   -= 2;
1101         v5   -= 2;
1102         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1103         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1104         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1105         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1106         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1107       }
1108       if (j==1) {
1109         tmp0  = tmps[*vi--];
1110         sum1 -= *v1-- * tmp0;
1111         sum2 -= *v2-- * tmp0;
1112         sum3 -= *v3-- * tmp0;
1113         sum4 -= *v4-- * tmp0;
1114         sum5 -= *v5-- * tmp0;
1115       }
1116 
1117       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1118       sum2   -= *v2-- * tmp0;
1119       sum3   -= *v3-- * tmp0;
1120       sum4   -= *v4-- * tmp0;
1121       sum5   -= *v5-- * tmp0;
1122       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1123       sum3   -= *v3-- * tmp0;
1124       sum4   -= *v4-- * tmp0;
1125       sum5   -= *v5-- * tmp0;
1126       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1127       sum4   -= *v4-- * tmp0;
1128       sum5   -= *v5-- * tmp0;
1129       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1130       sum5   -= *v5-- * tmp0;
1131       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1132       break;
1133     default:
1134       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1135     }
1136   }
1137   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1138   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1139   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1140   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1141   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1142   PetscFunctionReturn(0);
1143 }
1144 
1145 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1146 {
1147   Mat             C     =B;
1148   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1149   IS              isrow = b->row,isicol = b->icol;
1150   PetscErrorCode  ierr;
1151   const PetscInt  *r,*ic,*ics;
1152   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1153   PetscInt        i,j,k,nz,nzL,row,*pj;
1154   const PetscInt  *ajtmp,*bjtmp;
1155   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1156   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1157   FactorShiftCtx  sctx;
1158   const PetscInt  *ddiag;
1159   PetscReal       rs;
1160   MatScalar       d;
1161   PetscInt        inod,nodesz,node_max,col;
1162   const PetscInt  *ns;
1163   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1164 
1165   PetscFunctionBegin;
1166   /* MatPivotSetUp(): initialize shift context sctx */
1167   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1168 
1169   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1170     ddiag          = a->diag;
1171     sctx.shift_top = info->zeropivot;
1172     for (i=0; i<n; i++) {
1173       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1174       d  = (aa)[ddiag[i]];
1175       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1176       v  = aa+ai[i];
1177       nz = ai[i+1] - ai[i];
1178       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1179       if (rs>sctx.shift_top) sctx.shift_top = rs;
1180     }
1181     sctx.shift_top *= 1.1;
1182     sctx.nshift_max = 5;
1183     sctx.shift_lo   = 0.;
1184     sctx.shift_hi   = 1.;
1185   }
1186 
1187   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1188   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1189 
1190   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1191   ics   = ic;
1192 
1193   node_max = a->inode.node_count;
1194   ns       = a->inode.size;
1195   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1196 
1197   /* If max inode size > 4, split it into two inodes.*/
1198   /* also map the inode sizes according to the ordering */
1199   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1200   for (i=0,j=0; i<node_max; ++i,++j) {
1201     if (ns[i] > 4) {
1202       tmp_vec1[j] = 4;
1203       ++j;
1204       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1205     } else {
1206       tmp_vec1[j] = ns[i];
1207     }
1208   }
1209   /* Use the correct node_max */
1210   node_max = j;
1211 
1212   /* Now reorder the inode info based on mat re-ordering info */
1213   /* First create a row -> inode_size_array_index map */
1214   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1215   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1216   for (i=0,row=0; i<node_max; i++) {
1217     nodesz = tmp_vec1[i];
1218     for (j=0; j<nodesz; j++,row++) {
1219       nsmap[row] = i;
1220     }
1221   }
1222   /* Using nsmap, create a reordered ns structure */
1223   for (i=0,j=0; i< node_max; i++) {
1224     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1225     tmp_vec2[i] = nodesz;
1226     j          += nodesz;
1227   }
1228   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1229   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1230 
1231   /* Now use the correct ns */
1232   ns = tmp_vec2;
1233 
1234   do {
1235     sctx.newshift = PETSC_FALSE;
1236     /* Now loop over each block-row, and do the factorization */
1237     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1238       nodesz = ns[inod];
1239 
1240       switch (nodesz) {
1241       case 1:
1242         /*----------*/
1243         /* zero rtmp1 */
1244         /* L part */
1245         nz    = bi[i+1] - bi[i];
1246         bjtmp = bj + bi[i];
1247         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1248 
1249         /* U part */
1250         nz    = bdiag[i]-bdiag[i+1];
1251         bjtmp = bj + bdiag[i+1]+1;
1252         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1253 
1254         /* load in initial (unfactored row) */
1255         nz    = ai[r[i]+1] - ai[r[i]];
1256         ajtmp = aj + ai[r[i]];
1257         v     = aa + ai[r[i]];
1258         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1259 
1260         /* ZeropivotApply() */
1261         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1262 
1263         /* elimination */
1264         bjtmp = bj + bi[i];
1265         row   = *bjtmp++;
1266         nzL   = bi[i+1] - bi[i];
1267         for (k=0; k < nzL; k++) {
1268           pc = rtmp1 + row;
1269           if (*pc != 0.0) {
1270             pv   = b->a + bdiag[row];
1271             mul1 = *pc * (*pv);
1272             *pc  = mul1;
1273             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1274             pv   = b->a + bdiag[row+1]+1;
1275             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1276             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1277             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1278           }
1279           row = *bjtmp++;
1280         }
1281 
1282         /* finished row so stick it into b->a */
1283         rs = 0.0;
1284         /* L part */
1285         pv = b->a + bi[i];
1286         pj = b->j + bi[i];
1287         nz = bi[i+1] - bi[i];
1288         for (j=0; j<nz; j++) {
1289           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1290         }
1291 
1292         /* U part */
1293         pv = b->a + bdiag[i+1]+1;
1294         pj = b->j + bdiag[i+1]+1;
1295         nz = bdiag[i] - bdiag[i+1]-1;
1296         for (j=0; j<nz; j++) {
1297           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1298         }
1299 
1300         /* Check zero pivot */
1301         sctx.rs = rs;
1302         sctx.pv = rtmp1[i];
1303         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1304         if (sctx.newshift) break;
1305 
1306         /* Mark diagonal and invert diagonal for simplier triangular solves */
1307         pv  = b->a + bdiag[i];
1308         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1309         break;
1310 
1311       case 2:
1312         /*----------*/
1313         /* zero rtmp1 and rtmp2 */
1314         /* L part */
1315         nz    = bi[i+1] - bi[i];
1316         bjtmp = bj + bi[i];
1317         for  (j=0; j<nz; j++) {
1318           col        = bjtmp[j];
1319           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1320         }
1321 
1322         /* U part */
1323         nz    = bdiag[i]-bdiag[i+1];
1324         bjtmp = bj + bdiag[i+1]+1;
1325         for  (j=0; j<nz; j++) {
1326           col        = bjtmp[j];
1327           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1328         }
1329 
1330         /* load in initial (unfactored row) */
1331         nz    = ai[r[i]+1] - ai[r[i]];
1332         ajtmp = aj + ai[r[i]];
1333         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1334         for (j=0; j<nz; j++) {
1335           col        = ics[ajtmp[j]];
1336           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1337         }
1338         /* ZeropivotApply(): shift the diagonal of the matrix  */
1339         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1340 
1341         /* elimination */
1342         bjtmp = bj + bi[i];
1343         row   = *bjtmp++; /* pivot row */
1344         nzL   = bi[i+1] - bi[i];
1345         for (k=0; k < nzL; k++) {
1346           pc1 = rtmp1 + row;
1347           pc2 = rtmp2 + row;
1348           if (*pc1 != 0.0 || *pc2 != 0.0) {
1349             pv   = b->a + bdiag[row];
1350             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1351             *pc1 = mul1;       *pc2 = mul2;
1352 
1353             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1354             pv = b->a + bdiag[row+1]+1;
1355             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1356             for (j=0; j<nz; j++) {
1357               col         = pj[j];
1358               rtmp1[col] -= mul1 * pv[j];
1359               rtmp2[col] -= mul2 * pv[j];
1360             }
1361             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1362           }
1363           row = *bjtmp++;
1364         }
1365 
1366         /* finished row i; check zero pivot, then stick row i into b->a */
1367         rs = 0.0;
1368         /* L part */
1369         pc1 = b->a + bi[i];
1370         pj  = b->j + bi[i];
1371         nz  = bi[i+1] - bi[i];
1372         for (j=0; j<nz; j++) {
1373           col    = pj[j];
1374           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1375         }
1376         /* U part */
1377         pc1 = b->a + bdiag[i+1]+1;
1378         pj  = b->j + bdiag[i+1]+1;
1379         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1380         for (j=0; j<nz; j++) {
1381           col    = pj[j];
1382           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1383         }
1384 
1385         sctx.rs = rs;
1386         sctx.pv = rtmp1[i];
1387         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1388         if (sctx.newshift) break;
1389         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1390         *pc1 = 1.0/sctx.pv;
1391 
1392         /* Now take care of diagonal 2x2 block. */
1393         pc2 = rtmp2 + i;
1394         if (*pc2 != 0.0) {
1395           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1396           *pc2 = mul1;          /* insert L entry */
1397           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1398           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1399           for (j=0; j<nz; j++) {
1400             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1401           }
1402           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1403         }
1404 
1405         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1406         rs = 0.0;
1407         /* L part */
1408         pc2 = b->a + bi[i+1];
1409         pj  = b->j + bi[i+1];
1410         nz  = bi[i+2] - bi[i+1];
1411         for (j=0; j<nz; j++) {
1412           col    = pj[j];
1413           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1414         }
1415         /* U part */
1416         pc2 = b->a + bdiag[i+2]+1;
1417         pj  = b->j + bdiag[i+2]+1;
1418         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1419         for (j=0; j<nz; j++) {
1420           col    = pj[j];
1421           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1422         }
1423 
1424         sctx.rs = rs;
1425         sctx.pv = rtmp2[i+1];
1426         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1427         if (sctx.newshift) break;
1428         pc2  = b->a + bdiag[i+1];
1429         *pc2 = 1.0/sctx.pv;
1430         break;
1431 
1432       case 3:
1433         /*----------*/
1434         /* zero rtmp */
1435         /* L part */
1436         nz    = bi[i+1] - bi[i];
1437         bjtmp = bj + bi[i];
1438         for  (j=0; j<nz; j++) {
1439           col        = bjtmp[j];
1440           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1441         }
1442 
1443         /* U part */
1444         nz    = bdiag[i]-bdiag[i+1];
1445         bjtmp = bj + bdiag[i+1]+1;
1446         for  (j=0; j<nz; j++) {
1447           col        = bjtmp[j];
1448           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1449         }
1450 
1451         /* load in initial (unfactored row) */
1452         nz    = ai[r[i]+1] - ai[r[i]];
1453         ajtmp = aj + ai[r[i]];
1454         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1455         for (j=0; j<nz; j++) {
1456           col        = ics[ajtmp[j]];
1457           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1458         }
1459         /* ZeropivotApply(): shift the diagonal of the matrix  */
1460         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1461 
1462         /* elimination */
1463         bjtmp = bj + bi[i];
1464         row   = *bjtmp++; /* pivot row */
1465         nzL   = bi[i+1] - bi[i];
1466         for (k=0; k < nzL; k++) {
1467           pc1 = rtmp1 + row;
1468           pc2 = rtmp2 + row;
1469           pc3 = rtmp3 + row;
1470           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1471             pv   = b->a + bdiag[row];
1472             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1473             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1474 
1475             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1476             pv = b->a + bdiag[row+1]+1;
1477             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1478             for (j=0; j<nz; j++) {
1479               col         = pj[j];
1480               rtmp1[col] -= mul1 * pv[j];
1481               rtmp2[col] -= mul2 * pv[j];
1482               rtmp3[col] -= mul3 * pv[j];
1483             }
1484             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1485           }
1486           row = *bjtmp++;
1487         }
1488 
1489         /* finished row i; check zero pivot, then stick row i into b->a */
1490         rs = 0.0;
1491         /* L part */
1492         pc1 = b->a + bi[i];
1493         pj  = b->j + bi[i];
1494         nz  = bi[i+1] - bi[i];
1495         for (j=0; j<nz; j++) {
1496           col    = pj[j];
1497           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1498         }
1499         /* U part */
1500         pc1 = b->a + bdiag[i+1]+1;
1501         pj  = b->j + bdiag[i+1]+1;
1502         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1503         for (j=0; j<nz; j++) {
1504           col    = pj[j];
1505           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1506         }
1507 
1508         sctx.rs = rs;
1509         sctx.pv = rtmp1[i];
1510         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1511         if (sctx.newshift) break;
1512         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1513         *pc1 = 1.0/sctx.pv;
1514 
1515         /* Now take care of 1st column of diagonal 3x3 block. */
1516         pc2 = rtmp2 + i;
1517         pc3 = rtmp3 + i;
1518         if (*pc2 != 0.0 || *pc3 != 0.0) {
1519           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1520           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1521           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1522           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1523           for (j=0; j<nz; j++) {
1524             col         = pj[j];
1525             rtmp2[col] -= mul2 * rtmp1[col];
1526             rtmp3[col] -= mul3 * rtmp1[col];
1527           }
1528           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1529         }
1530 
1531         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1532         rs = 0.0;
1533         /* L part */
1534         pc2 = b->a + bi[i+1];
1535         pj  = b->j + bi[i+1];
1536         nz  = bi[i+2] - bi[i+1];
1537         for (j=0; j<nz; j++) {
1538           col    = pj[j];
1539           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1540         }
1541         /* U part */
1542         pc2 = b->a + bdiag[i+2]+1;
1543         pj  = b->j + bdiag[i+2]+1;
1544         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1545         for (j=0; j<nz; j++) {
1546           col    = pj[j];
1547           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1548         }
1549 
1550         sctx.rs = rs;
1551         sctx.pv = rtmp2[i+1];
1552         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1553         if (sctx.newshift) break;
1554         pc2  = b->a + bdiag[i+1];
1555         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1556 
1557         /* Now take care of 2nd column of diagonal 3x3 block. */
1558         pc3 = rtmp3 + i+1;
1559         if (*pc3 != 0.0) {
1560           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1561           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1562           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1563           for (j=0; j<nz; j++) {
1564             col         = pj[j];
1565             rtmp3[col] -= mul3 * rtmp2[col];
1566           }
1567           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1568         }
1569 
1570         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1571         rs = 0.0;
1572         /* L part */
1573         pc3 = b->a + bi[i+2];
1574         pj  = b->j + bi[i+2];
1575         nz  = bi[i+3] - bi[i+2];
1576         for (j=0; j<nz; j++) {
1577           col    = pj[j];
1578           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1579         }
1580         /* U part */
1581         pc3 = b->a + bdiag[i+3]+1;
1582         pj  = b->j + bdiag[i+3]+1;
1583         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1584         for (j=0; j<nz; j++) {
1585           col    = pj[j];
1586           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1587         }
1588 
1589         sctx.rs = rs;
1590         sctx.pv = rtmp3[i+2];
1591         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1592         if (sctx.newshift) break;
1593         pc3  = b->a + bdiag[i+2];
1594         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1595         break;
1596       case 4:
1597         /*----------*/
1598         /* zero rtmp */
1599         /* L part */
1600         nz    = bi[i+1] - bi[i];
1601         bjtmp = bj + bi[i];
1602         for  (j=0; j<nz; j++) {
1603           col        = bjtmp[j];
1604           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1605         }
1606 
1607         /* U part */
1608         nz    = bdiag[i]-bdiag[i+1];
1609         bjtmp = bj + bdiag[i+1]+1;
1610         for  (j=0; j<nz; j++) {
1611           col        = bjtmp[j];
1612           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1613         }
1614 
1615         /* load in initial (unfactored row) */
1616         nz    = ai[r[i]+1] - ai[r[i]];
1617         ajtmp = aj + ai[r[i]];
1618         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1619         for (j=0; j<nz; j++) {
1620           col        = ics[ajtmp[j]];
1621           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1622         }
1623         /* ZeropivotApply(): shift the diagonal of the matrix  */
1624         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1625 
1626         /* elimination */
1627         bjtmp = bj + bi[i];
1628         row   = *bjtmp++; /* pivot row */
1629         nzL   = bi[i+1] - bi[i];
1630         for (k=0; k < nzL; k++) {
1631           pc1 = rtmp1 + row;
1632           pc2 = rtmp2 + row;
1633           pc3 = rtmp3 + row;
1634           pc4 = rtmp4 + row;
1635           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1636             pv   = b->a + bdiag[row];
1637             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1638             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1639 
1640             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1641             pv = b->a + bdiag[row+1]+1;
1642             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1643             for (j=0; j<nz; j++) {
1644               col         = pj[j];
1645               rtmp1[col] -= mul1 * pv[j];
1646               rtmp2[col] -= mul2 * pv[j];
1647               rtmp3[col] -= mul3 * pv[j];
1648               rtmp4[col] -= mul4 * pv[j];
1649             }
1650             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1651           }
1652           row = *bjtmp++;
1653         }
1654 
1655         /* finished row i; check zero pivot, then stick row i into b->a */
1656         rs = 0.0;
1657         /* L part */
1658         pc1 = b->a + bi[i];
1659         pj  = b->j + bi[i];
1660         nz  = bi[i+1] - bi[i];
1661         for (j=0; j<nz; j++) {
1662           col    = pj[j];
1663           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1664         }
1665         /* U part */
1666         pc1 = b->a + bdiag[i+1]+1;
1667         pj  = b->j + bdiag[i+1]+1;
1668         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1669         for (j=0; j<nz; j++) {
1670           col    = pj[j];
1671           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1672         }
1673 
1674         sctx.rs = rs;
1675         sctx.pv = rtmp1[i];
1676         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1677         if (sctx.newshift) break;
1678         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1679         *pc1 = 1.0/sctx.pv;
1680 
1681         /* Now take care of 1st column of diagonal 4x4 block. */
1682         pc2 = rtmp2 + i;
1683         pc3 = rtmp3 + i;
1684         pc4 = rtmp4 + i;
1685         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1686           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1687           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1688           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1689           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1690           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1691           for (j=0; j<nz; j++) {
1692             col         = pj[j];
1693             rtmp2[col] -= mul2 * rtmp1[col];
1694             rtmp3[col] -= mul3 * rtmp1[col];
1695             rtmp4[col] -= mul4 * rtmp1[col];
1696           }
1697           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1698         }
1699 
1700         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1701         rs = 0.0;
1702         /* L part */
1703         pc2 = b->a + bi[i+1];
1704         pj  = b->j + bi[i+1];
1705         nz  = bi[i+2] - bi[i+1];
1706         for (j=0; j<nz; j++) {
1707           col    = pj[j];
1708           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1709         }
1710         /* U part */
1711         pc2 = b->a + bdiag[i+2]+1;
1712         pj  = b->j + bdiag[i+2]+1;
1713         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1714         for (j=0; j<nz; j++) {
1715           col    = pj[j];
1716           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1717         }
1718 
1719         sctx.rs = rs;
1720         sctx.pv = rtmp2[i+1];
1721         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1722         if (sctx.newshift) break;
1723         pc2  = b->a + bdiag[i+1];
1724         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1725 
1726         /* Now take care of 2nd column of diagonal 4x4 block. */
1727         pc3 = rtmp3 + i+1;
1728         pc4 = rtmp4 + i+1;
1729         if (*pc3 != 0.0 || *pc4 != 0.0) {
1730           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1731           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1732           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1733           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1734           for (j=0; j<nz; j++) {
1735             col         = pj[j];
1736             rtmp3[col] -= mul3 * rtmp2[col];
1737             rtmp4[col] -= mul4 * rtmp2[col];
1738           }
1739           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1740         }
1741 
1742         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1743         rs = 0.0;
1744         /* L part */
1745         pc3 = b->a + bi[i+2];
1746         pj  = b->j + bi[i+2];
1747         nz  = bi[i+3] - bi[i+2];
1748         for (j=0; j<nz; j++) {
1749           col    = pj[j];
1750           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1751         }
1752         /* U part */
1753         pc3 = b->a + bdiag[i+3]+1;
1754         pj  = b->j + bdiag[i+3]+1;
1755         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1756         for (j=0; j<nz; j++) {
1757           col    = pj[j];
1758           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1759         }
1760 
1761         sctx.rs = rs;
1762         sctx.pv = rtmp3[i+2];
1763         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1764         if (sctx.newshift) break;
1765         pc3  = b->a + bdiag[i+2];
1766         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1767 
1768         /* Now take care of 3rd column of diagonal 4x4 block. */
1769         pc4 = rtmp4 + i+2;
1770         if (*pc4 != 0.0) {
1771           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1772           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1773           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1774           for (j=0; j<nz; j++) {
1775             col         = pj[j];
1776             rtmp4[col] -= mul4 * rtmp3[col];
1777           }
1778           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1779         }
1780 
1781         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1782         rs = 0.0;
1783         /* L part */
1784         pc4 = b->a + bi[i+3];
1785         pj  = b->j + bi[i+3];
1786         nz  = bi[i+4] - bi[i+3];
1787         for (j=0; j<nz; j++) {
1788           col    = pj[j];
1789           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1790         }
1791         /* U part */
1792         pc4 = b->a + bdiag[i+4]+1;
1793         pj  = b->j + bdiag[i+4]+1;
1794         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1795         for (j=0; j<nz; j++) {
1796           col    = pj[j];
1797           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1798         }
1799 
1800         sctx.rs = rs;
1801         sctx.pv = rtmp4[i+3];
1802         ierr    = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr);
1803         if (sctx.newshift) break;
1804         pc4  = b->a + bdiag[i+3];
1805         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1806         break;
1807 
1808       default:
1809         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1810       }
1811       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1812       i += nodesz;                 /* Update the row */
1813     }
1814 
1815     /* MatPivotRefine() */
1816     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1817       /*
1818        * if no shift in this attempt & shifting & started shifting & can refine,
1819        * then try lower shift
1820        */
1821       sctx.shift_hi       = sctx.shift_fraction;
1822       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1823       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1824       sctx.newshift       = PETSC_TRUE;
1825       sctx.nshift++;
1826     }
1827   } while (sctx.newshift);
1828 
1829   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1830   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1831   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1832   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1833 
1834   if (b->inode.size) {
1835     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1836   } else {
1837     C->ops->solve           = MatSolve_SeqAIJ;
1838   }
1839   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1840   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1841   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1842   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1843   C->assembled              = PETSC_TRUE;
1844   C->preallocated           = PETSC_TRUE;
1845 
1846   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1847 
1848   /* MatShiftView(A,info,&sctx) */
1849   if (sctx.nshift) {
1850     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1851       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1852     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1853       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1854     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1855       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1856     }
1857   }
1858   PetscFunctionReturn(0);
1859 }
1860 
1861 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1862 {
1863   Mat             C     = B;
1864   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1865   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1866   PetscErrorCode  ierr;
1867   const PetscInt  *r,*ic,*c,*ics;
1868   PetscInt        n   = A->rmap->n,*bi = b->i;
1869   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1870   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1871   PetscInt        *ai = a->i,*aj = a->j;
1872   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1873   PetscScalar     mul1,mul2,mul3,tmp;
1874   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1875   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1876   PetscReal       rs=0.0;
1877   FactorShiftCtx  sctx;
1878 
1879   PetscFunctionBegin;
1880   sctx.shift_top      = 0;
1881   sctx.nshift_max     = 0;
1882   sctx.shift_lo       = 0;
1883   sctx.shift_hi       = 0;
1884   sctx.shift_fraction = 0;
1885 
1886   /* if both shift schemes are chosen by user, only use info->shiftpd */
1887   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1888     sctx.shift_top = 0;
1889     for (i=0; i<n; i++) {
1890       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1891       rs    = 0.0;
1892       ajtmp = aj + ai[i];
1893       rtmp1 = aa + ai[i];
1894       nz    = ai[i+1] - ai[i];
1895       for (j=0; j<nz; j++) {
1896         if (*ajtmp != i) {
1897           rs += PetscAbsScalar(*rtmp1++);
1898         } else {
1899           rs -= PetscRealPart(*rtmp1++);
1900         }
1901         ajtmp++;
1902       }
1903       if (rs>sctx.shift_top) sctx.shift_top = rs;
1904     }
1905     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1906     sctx.shift_top *= 1.1;
1907     sctx.nshift_max = 5;
1908     sctx.shift_lo   = 0.;
1909     sctx.shift_hi   = 1.;
1910   }
1911   sctx.shift_amount = 0;
1912   sctx.nshift       = 0;
1913 
1914   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1915   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1916   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1917   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1918   ics    = ic;
1919 
1920   node_max = a->inode.node_count;
1921   ns       = a->inode.size;
1922   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1923 
1924   /* If max inode size > 3, split it into two inodes.*/
1925   /* also map the inode sizes according to the ordering */
1926   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1927   for (i=0,j=0; i<node_max; ++i,++j) {
1928     if (ns[i]>3) {
1929       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1930       ++j;
1931       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1932     } else {
1933       tmp_vec1[j] = ns[i];
1934     }
1935   }
1936   /* Use the correct node_max */
1937   node_max = j;
1938 
1939   /* Now reorder the inode info based on mat re-ordering info */
1940   /* First create a row -> inode_size_array_index map */
1941   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1942   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1943   for (i=0,row=0; i<node_max; i++) {
1944     nodesz = tmp_vec1[i];
1945     for (j=0; j<nodesz; j++,row++) {
1946       nsmap[row] = i;
1947     }
1948   }
1949   /* Using nsmap, create a reordered ns structure */
1950   for (i=0,j=0; i< node_max; i++) {
1951     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1952     tmp_vec2[i] = nodesz;
1953     j          += nodesz;
1954   }
1955   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1956   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1957   /* Now use the correct ns */
1958   ns = tmp_vec2;
1959 
1960   do {
1961     sctx.newshift = PETSC_FALSE;
1962     /* Now loop over each block-row, and do the factorization */
1963     for (i=0,row=0; i<node_max; i++) {
1964       nodesz = ns[i];
1965       nz     = bi[row+1] - bi[row];
1966       bjtmp  = bj + bi[row];
1967 
1968       switch (nodesz) {
1969       case 1:
1970         for  (j=0; j<nz; j++) {
1971           idx         = bjtmp[j];
1972           rtmp11[idx] = 0.0;
1973         }
1974 
1975         /* load in initial (unfactored row) */
1976         idx    = r[row];
1977         nz_tmp = ai[idx+1] - ai[idx];
1978         ajtmp  = aj + ai[idx];
1979         v1     = aa + ai[idx];
1980 
1981         for (j=0; j<nz_tmp; j++) {
1982           idx         = ics[ajtmp[j]];
1983           rtmp11[idx] = v1[j];
1984         }
1985         rtmp11[ics[r[row]]] += sctx.shift_amount;
1986 
1987         prow = *bjtmp++;
1988         while (prow < row) {
1989           pc1 = rtmp11 + prow;
1990           if (*pc1 != 0.0) {
1991             pv     = ba + bd[prow];
1992             pj     = nbj + bd[prow];
1993             mul1   = *pc1 * *pv++;
1994             *pc1   = mul1;
1995             nz_tmp = bi[prow+1] - bd[prow] - 1;
1996             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
1997             for (j=0; j<nz_tmp; j++) {
1998               tmp          = pv[j];
1999               idx          = pj[j];
2000               rtmp11[idx] -= mul1 * tmp;
2001             }
2002           }
2003           prow = *bjtmp++;
2004         }
2005         pj  = bj + bi[row];
2006         pc1 = ba + bi[row];
2007 
2008         sctx.pv     = rtmp11[row];
2009         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2010         rs          = 0.0;
2011         for (j=0; j<nz; j++) {
2012           idx    = pj[j];
2013           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2014           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2015         }
2016         sctx.rs = rs;
2017         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2018         if (sctx.newshift) goto endofwhile;
2019         break;
2020 
2021       case 2:
2022         for (j=0; j<nz; j++) {
2023           idx         = bjtmp[j];
2024           rtmp11[idx] = 0.0;
2025           rtmp22[idx] = 0.0;
2026         }
2027 
2028         /* load in initial (unfactored row) */
2029         idx    = r[row];
2030         nz_tmp = ai[idx+1] - ai[idx];
2031         ajtmp  = aj + ai[idx];
2032         v1     = aa + ai[idx];
2033         v2     = aa + ai[idx+1];
2034         for (j=0; j<nz_tmp; j++) {
2035           idx         = ics[ajtmp[j]];
2036           rtmp11[idx] = v1[j];
2037           rtmp22[idx] = v2[j];
2038         }
2039         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2040         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2041 
2042         prow = *bjtmp++;
2043         while (prow < row) {
2044           pc1 = rtmp11 + prow;
2045           pc2 = rtmp22 + prow;
2046           if (*pc1 != 0.0 || *pc2 != 0.0) {
2047             pv   = ba + bd[prow];
2048             pj   = nbj + bd[prow];
2049             mul1 = *pc1 * *pv;
2050             mul2 = *pc2 * *pv;
2051             ++pv;
2052             *pc1 = mul1;
2053             *pc2 = mul2;
2054 
2055             nz_tmp = bi[prow+1] - bd[prow] - 1;
2056             for (j=0; j<nz_tmp; j++) {
2057               tmp          = pv[j];
2058               idx          = pj[j];
2059               rtmp11[idx] -= mul1 * tmp;
2060               rtmp22[idx] -= mul2 * tmp;
2061             }
2062             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2063           }
2064           prow = *bjtmp++;
2065         }
2066 
2067         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2068         pc1 = rtmp11 + prow;
2069         pc2 = rtmp22 + prow;
2070 
2071         sctx.pv = *pc1;
2072         pj      = bj + bi[prow];
2073         rs      = 0.0;
2074         for (j=0; j<nz; j++) {
2075           idx = pj[j];
2076           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2077         }
2078         sctx.rs = rs;
2079         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2080         if (sctx.newshift) goto endofwhile;
2081 
2082         if (*pc2 != 0.0) {
2083           pj     = nbj + bd[prow];
2084           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2085           *pc2   = mul2;
2086           nz_tmp = bi[prow+1] - bd[prow] - 1;
2087           for (j=0; j<nz_tmp; j++) {
2088             idx          = pj[j];
2089             tmp          = rtmp11[idx];
2090             rtmp22[idx] -= mul2 * tmp;
2091           }
2092           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2093         }
2094 
2095         pj  = bj + bi[row];
2096         pc1 = ba + bi[row];
2097         pc2 = ba + bi[row+1];
2098 
2099         sctx.pv       = rtmp22[row+1];
2100         rs            = 0.0;
2101         rtmp11[row]   = 1.0/rtmp11[row];
2102         rtmp22[row+1] = 1.0/rtmp22[row+1];
2103         /* copy row entries from dense representation to sparse */
2104         for (j=0; j<nz; j++) {
2105           idx    = pj[j];
2106           pc1[j] = rtmp11[idx];
2107           pc2[j] = rtmp22[idx];
2108           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2109         }
2110         sctx.rs = rs;
2111         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2112         if (sctx.newshift) goto endofwhile;
2113         break;
2114 
2115       case 3:
2116         for  (j=0; j<nz; j++) {
2117           idx         = bjtmp[j];
2118           rtmp11[idx] = 0.0;
2119           rtmp22[idx] = 0.0;
2120           rtmp33[idx] = 0.0;
2121         }
2122         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2123         idx    = r[row];
2124         nz_tmp = ai[idx+1] - ai[idx];
2125         ajtmp  = aj + ai[idx];
2126         v1     = aa + ai[idx];
2127         v2     = aa + ai[idx+1];
2128         v3     = aa + ai[idx+2];
2129         for (j=0; j<nz_tmp; j++) {
2130           idx         = ics[ajtmp[j]];
2131           rtmp11[idx] = v1[j];
2132           rtmp22[idx] = v2[j];
2133           rtmp33[idx] = v3[j];
2134         }
2135         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2136         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2137         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2138 
2139         /* loop over all pivot row blocks above this row block */
2140         prow = *bjtmp++;
2141         while (prow < row) {
2142           pc1 = rtmp11 + prow;
2143           pc2 = rtmp22 + prow;
2144           pc3 = rtmp33 + prow;
2145           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2146             pv   = ba  + bd[prow];
2147             pj   = nbj + bd[prow];
2148             mul1 = *pc1 * *pv;
2149             mul2 = *pc2 * *pv;
2150             mul3 = *pc3 * *pv;
2151             ++pv;
2152             *pc1 = mul1;
2153             *pc2 = mul2;
2154             *pc3 = mul3;
2155 
2156             nz_tmp = bi[prow+1] - bd[prow] - 1;
2157             /* update this row based on pivot row */
2158             for (j=0; j<nz_tmp; j++) {
2159               tmp          = pv[j];
2160               idx          = pj[j];
2161               rtmp11[idx] -= mul1 * tmp;
2162               rtmp22[idx] -= mul2 * tmp;
2163               rtmp33[idx] -= mul3 * tmp;
2164             }
2165             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2166           }
2167           prow = *bjtmp++;
2168         }
2169 
2170         /* Now take care of diagonal 3x3 block in this set of rows */
2171         /* note: prow = row here */
2172         pc1 = rtmp11 + prow;
2173         pc2 = rtmp22 + prow;
2174         pc3 = rtmp33 + prow;
2175 
2176         sctx.pv = *pc1;
2177         pj      = bj + bi[prow];
2178         rs      = 0.0;
2179         for (j=0; j<nz; j++) {
2180           idx = pj[j];
2181           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2182         }
2183         sctx.rs = rs;
2184         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2185         if (sctx.newshift) goto endofwhile;
2186 
2187         if (*pc2 != 0.0 || *pc3 != 0.0) {
2188           mul2   = (*pc2)/(*pc1);
2189           mul3   = (*pc3)/(*pc1);
2190           *pc2   = mul2;
2191           *pc3   = mul3;
2192           nz_tmp = bi[prow+1] - bd[prow] - 1;
2193           pj     = nbj + bd[prow];
2194           for (j=0; j<nz_tmp; j++) {
2195             idx          = pj[j];
2196             tmp          = rtmp11[idx];
2197             rtmp22[idx] -= mul2 * tmp;
2198             rtmp33[idx] -= mul3 * tmp;
2199           }
2200           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2201         }
2202         ++prow;
2203 
2204         pc2     = rtmp22 + prow;
2205         pc3     = rtmp33 + prow;
2206         sctx.pv = *pc2;
2207         pj      = bj + bi[prow];
2208         rs      = 0.0;
2209         for (j=0; j<nz; j++) {
2210           idx = pj[j];
2211           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2212         }
2213         sctx.rs = rs;
2214         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2215         if (sctx.newshift) goto endofwhile;
2216 
2217         if (*pc3 != 0.0) {
2218           mul3   = (*pc3)/(*pc2);
2219           *pc3   = mul3;
2220           pj     = nbj + bd[prow];
2221           nz_tmp = bi[prow+1] - bd[prow] - 1;
2222           for (j=0; j<nz_tmp; j++) {
2223             idx          = pj[j];
2224             tmp          = rtmp22[idx];
2225             rtmp33[idx] -= mul3 * tmp;
2226           }
2227           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2228         }
2229 
2230         pj  = bj + bi[row];
2231         pc1 = ba + bi[row];
2232         pc2 = ba + bi[row+1];
2233         pc3 = ba + bi[row+2];
2234 
2235         sctx.pv       = rtmp33[row+2];
2236         rs            = 0.0;
2237         rtmp11[row]   = 1.0/rtmp11[row];
2238         rtmp22[row+1] = 1.0/rtmp22[row+1];
2239         rtmp33[row+2] = 1.0/rtmp33[row+2];
2240         /* copy row entries from dense representation to sparse */
2241         for (j=0; j<nz; j++) {
2242           idx    = pj[j];
2243           pc1[j] = rtmp11[idx];
2244           pc2[j] = rtmp22[idx];
2245           pc3[j] = rtmp33[idx];
2246           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2247         }
2248 
2249         sctx.rs = rs;
2250         ierr    = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr);
2251         if (sctx.newshift) goto endofwhile;
2252         break;
2253 
2254       default:
2255         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2256       }
2257       row += nodesz;                 /* Update the row */
2258     }
2259 endofwhile:;
2260   } while (sctx.newshift);
2261   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2262   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2263   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2264   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2265   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2266 
2267   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2268   /* do not set solve add, since MatSolve_Inode + Add is faster */
2269   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2270   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2271   C->assembled              = PETSC_TRUE;
2272   C->preallocated           = PETSC_TRUE;
2273   if (sctx.nshift) {
2274     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2275       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2276     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2277       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2278     }
2279   }
2280   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2281   ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2282   PetscFunctionReturn(0);
2283 }
2284 
2285 
2286 /* ----------------------------------------------------------- */
2287 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2288 {
2289   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2290   IS                iscol = a->col,isrow = a->row;
2291   PetscErrorCode    ierr;
2292   const PetscInt    *r,*c,*rout,*cout;
2293   PetscInt          i,j,n = A->rmap->n;
2294   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2295   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2296   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2297   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2298   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2299   const PetscScalar *b;
2300 
2301   PetscFunctionBegin;
2302   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2303   node_max = a->inode.node_count;
2304   ns       = a->inode.size;     /* Node Size array */
2305 
2306   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2307   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2308   tmp  = a->solve_work;
2309 
2310   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2311   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2312 
2313   /* forward solve the lower triangular */
2314   tmps = tmp;
2315   aa   = a_a;
2316   aj   = a_j;
2317   ad   = a->diag;
2318 
2319   for (i = 0,row = 0; i< node_max; ++i) {
2320     nsz = ns[i];
2321     aii = ai[row];
2322     v1  = aa + aii;
2323     vi  = aj + aii;
2324     nz  = ai[row+1]- ai[row];
2325 
2326     if (i < node_max-1) {
2327       /* Prefetch the indices for the next block */
2328       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2329       /* Prefetch the data for the next block */
2330       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2331     }
2332 
2333     switch (nsz) {               /* Each loop in 'case' is unrolled */
2334     case 1:
2335       sum1 = b[r[row]];
2336       for (j=0; j<nz-1; j+=2) {
2337         i0    = vi[j];
2338         i1    = vi[j+1];
2339         tmp0  = tmps[i0];
2340         tmp1  = tmps[i1];
2341         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2342       }
2343       if (j == nz-1) {
2344         tmp0  = tmps[vi[j]];
2345         sum1 -= v1[j]*tmp0;
2346       }
2347       tmp[row++]=sum1;
2348       break;
2349     case 2:
2350       sum1 = b[r[row]];
2351       sum2 = b[r[row+1]];
2352       v2   = aa + ai[row+1];
2353 
2354       for (j=0; j<nz-1; j+=2) {
2355         i0    = vi[j];
2356         i1    = vi[j+1];
2357         tmp0  = tmps[i0];
2358         tmp1  = tmps[i1];
2359         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2360         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2361       }
2362       if (j == nz-1) {
2363         tmp0  = tmps[vi[j]];
2364         sum1 -= v1[j] *tmp0;
2365         sum2 -= v2[j] *tmp0;
2366       }
2367       sum2     -= v2[nz] * sum1;
2368       tmp[row++]=sum1;
2369       tmp[row++]=sum2;
2370       break;
2371     case 3:
2372       sum1 = b[r[row]];
2373       sum2 = b[r[row+1]];
2374       sum3 = b[r[row+2]];
2375       v2   = aa + ai[row+1];
2376       v3   = aa + ai[row+2];
2377 
2378       for (j=0; j<nz-1; j+=2) {
2379         i0    = vi[j];
2380         i1    = vi[j+1];
2381         tmp0  = tmps[i0];
2382         tmp1  = tmps[i1];
2383         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2384         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2385         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2386       }
2387       if (j == nz-1) {
2388         tmp0  = tmps[vi[j]];
2389         sum1 -= v1[j] *tmp0;
2390         sum2 -= v2[j] *tmp0;
2391         sum3 -= v3[j] *tmp0;
2392       }
2393       sum2     -= v2[nz] * sum1;
2394       sum3     -= v3[nz] * sum1;
2395       sum3     -= v3[nz+1] * sum2;
2396       tmp[row++]=sum1;
2397       tmp[row++]=sum2;
2398       tmp[row++]=sum3;
2399       break;
2400 
2401     case 4:
2402       sum1 = b[r[row]];
2403       sum2 = b[r[row+1]];
2404       sum3 = b[r[row+2]];
2405       sum4 = b[r[row+3]];
2406       v2   = aa + ai[row+1];
2407       v3   = aa + ai[row+2];
2408       v4   = aa + ai[row+3];
2409 
2410       for (j=0; j<nz-1; j+=2) {
2411         i0    = vi[j];
2412         i1    = vi[j+1];
2413         tmp0  = tmps[i0];
2414         tmp1  = tmps[i1];
2415         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2416         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2417         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2418         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2419       }
2420       if (j == nz-1) {
2421         tmp0  = tmps[vi[j]];
2422         sum1 -= v1[j] *tmp0;
2423         sum2 -= v2[j] *tmp0;
2424         sum3 -= v3[j] *tmp0;
2425         sum4 -= v4[j] *tmp0;
2426       }
2427       sum2 -= v2[nz] * sum1;
2428       sum3 -= v3[nz] * sum1;
2429       sum4 -= v4[nz] * sum1;
2430       sum3 -= v3[nz+1] * sum2;
2431       sum4 -= v4[nz+1] * sum2;
2432       sum4 -= v4[nz+2] * sum3;
2433 
2434       tmp[row++]=sum1;
2435       tmp[row++]=sum2;
2436       tmp[row++]=sum3;
2437       tmp[row++]=sum4;
2438       break;
2439     case 5:
2440       sum1 = b[r[row]];
2441       sum2 = b[r[row+1]];
2442       sum3 = b[r[row+2]];
2443       sum4 = b[r[row+3]];
2444       sum5 = b[r[row+4]];
2445       v2   = aa + ai[row+1];
2446       v3   = aa + ai[row+2];
2447       v4   = aa + ai[row+3];
2448       v5   = aa + ai[row+4];
2449 
2450       for (j=0; j<nz-1; j+=2) {
2451         i0    = vi[j];
2452         i1    = vi[j+1];
2453         tmp0  = tmps[i0];
2454         tmp1  = tmps[i1];
2455         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2456         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2457         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2458         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2459         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2460       }
2461       if (j == nz-1) {
2462         tmp0  = tmps[vi[j]];
2463         sum1 -= v1[j] *tmp0;
2464         sum2 -= v2[j] *tmp0;
2465         sum3 -= v3[j] *tmp0;
2466         sum4 -= v4[j] *tmp0;
2467         sum5 -= v5[j] *tmp0;
2468       }
2469 
2470       sum2 -= v2[nz] * sum1;
2471       sum3 -= v3[nz] * sum1;
2472       sum4 -= v4[nz] * sum1;
2473       sum5 -= v5[nz] * sum1;
2474       sum3 -= v3[nz+1] * sum2;
2475       sum4 -= v4[nz+1] * sum2;
2476       sum5 -= v5[nz+1] * sum2;
2477       sum4 -= v4[nz+2] * sum3;
2478       sum5 -= v5[nz+2] * sum3;
2479       sum5 -= v5[nz+3] * sum4;
2480 
2481       tmp[row++]=sum1;
2482       tmp[row++]=sum2;
2483       tmp[row++]=sum3;
2484       tmp[row++]=sum4;
2485       tmp[row++]=sum5;
2486       break;
2487     default:
2488       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2489     }
2490   }
2491   /* backward solve the upper triangular */
2492   for (i=node_max -1,row = n-1; i>=0; i--) {
2493     nsz = ns[i];
2494     aii = ad[row+1] + 1;
2495     v1  = aa + aii;
2496     vi  = aj + aii;
2497     nz  = ad[row]- ad[row+1] - 1;
2498 
2499     if (i > 0) {
2500       /* Prefetch the indices for the next block */
2501       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2502       /* Prefetch the data for the next block */
2503       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2504     }
2505 
2506     switch (nsz) {               /* Each loop in 'case' is unrolled */
2507     case 1:
2508       sum1 = tmp[row];
2509 
2510       for (j=0; j<nz-1; j+=2) {
2511         i0    = vi[j];
2512         i1    = vi[j+1];
2513         tmp0  = tmps[i0];
2514         tmp1  = tmps[i1];
2515         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2516       }
2517       if (j == nz-1) {
2518         tmp0  = tmps[vi[j]];
2519         sum1 -= v1[j]*tmp0;
2520       }
2521       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2522       break;
2523     case 2:
2524       sum1 = tmp[row];
2525       sum2 = tmp[row-1];
2526       v2   = aa + ad[row] + 1;
2527       for (j=0; j<nz-1; j+=2) {
2528         i0    = vi[j];
2529         i1    = vi[j+1];
2530         tmp0  = tmps[i0];
2531         tmp1  = tmps[i1];
2532         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2533         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2534       }
2535       if (j == nz-1) {
2536         tmp0  = tmps[vi[j]];
2537         sum1 -= v1[j]* tmp0;
2538         sum2 -= v2[j+1]* tmp0;
2539       }
2540 
2541       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2542       sum2     -= v2[0] * tmp0;
2543       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2544       break;
2545     case 3:
2546       sum1 = tmp[row];
2547       sum2 = tmp[row -1];
2548       sum3 = tmp[row -2];
2549       v2   = aa + ad[row] + 1;
2550       v3   = aa + ad[row -1] + 1;
2551       for (j=0; j<nz-1; j+=2) {
2552         i0    = vi[j];
2553         i1    = vi[j+1];
2554         tmp0  = tmps[i0];
2555         tmp1  = tmps[i1];
2556         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2557         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2558         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2559       }
2560       if (j== nz-1) {
2561         tmp0  = tmps[vi[j]];
2562         sum1 -= v1[j] * tmp0;
2563         sum2 -= v2[j+1] * tmp0;
2564         sum3 -= v3[j+2] * tmp0;
2565       }
2566       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2567       sum2     -= v2[0]* tmp0;
2568       sum3     -= v3[1] * tmp0;
2569       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2570       sum3     -= v3[0]* tmp0;
2571       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2572 
2573       break;
2574     case 4:
2575       sum1 = tmp[row];
2576       sum2 = tmp[row -1];
2577       sum3 = tmp[row -2];
2578       sum4 = tmp[row -3];
2579       v2   = aa + ad[row]+1;
2580       v3   = aa + ad[row -1]+1;
2581       v4   = aa + ad[row -2]+1;
2582 
2583       for (j=0; j<nz-1; j+=2) {
2584         i0    = vi[j];
2585         i1    = vi[j+1];
2586         tmp0  = tmps[i0];
2587         tmp1  = tmps[i1];
2588         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2589         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2590         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2591         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2592       }
2593       if (j== nz-1) {
2594         tmp0  = tmps[vi[j]];
2595         sum1 -= v1[j] * tmp0;
2596         sum2 -= v2[j+1] * tmp0;
2597         sum3 -= v3[j+2] * tmp0;
2598         sum4 -= v4[j+3] * tmp0;
2599       }
2600 
2601       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2602       sum2     -= v2[0] * tmp0;
2603       sum3     -= v3[1] * tmp0;
2604       sum4     -= v4[2] * tmp0;
2605       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2606       sum3     -= v3[0] * tmp0;
2607       sum4     -= v4[1] * tmp0;
2608       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2609       sum4     -= v4[0] * tmp0;
2610       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2611       break;
2612     case 5:
2613       sum1 = tmp[row];
2614       sum2 = tmp[row -1];
2615       sum3 = tmp[row -2];
2616       sum4 = tmp[row -3];
2617       sum5 = tmp[row -4];
2618       v2   = aa + ad[row]+1;
2619       v3   = aa + ad[row -1]+1;
2620       v4   = aa + ad[row -2]+1;
2621       v5   = aa + ad[row -3]+1;
2622       for (j=0; j<nz-1; j+=2) {
2623         i0    = vi[j];
2624         i1    = vi[j+1];
2625         tmp0  = tmps[i0];
2626         tmp1  = tmps[i1];
2627         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2628         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2629         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2630         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2631         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2632       }
2633       if (j==nz-1) {
2634         tmp0  = tmps[vi[j]];
2635         sum1 -= v1[j] * tmp0;
2636         sum2 -= v2[j+1] * tmp0;
2637         sum3 -= v3[j+2] * tmp0;
2638         sum4 -= v4[j+3] * tmp0;
2639         sum5 -= v5[j+4] * tmp0;
2640       }
2641 
2642       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2643       sum2     -= v2[0] * tmp0;
2644       sum3     -= v3[1] * tmp0;
2645       sum4     -= v4[2] * tmp0;
2646       sum5     -= v5[3] * tmp0;
2647       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2648       sum3     -= v3[0] * tmp0;
2649       sum4     -= v4[1] * tmp0;
2650       sum5     -= v5[2] * tmp0;
2651       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2652       sum4     -= v4[0] * tmp0;
2653       sum5     -= v5[1] * tmp0;
2654       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2655       sum5     -= v5[0] * tmp0;
2656       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2657       break;
2658     default:
2659       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2660     }
2661   }
2662   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2663   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2664   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2665   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2666   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2667   PetscFunctionReturn(0);
2668 }
2669 
2670 
2671 /*
2672      Makes a longer coloring[] array and calls the usual code with that
2673 */
2674 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2675 {
2676   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2677   PetscErrorCode  ierr;
2678   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2679   PetscInt        *colorused,i;
2680   ISColoringValue *newcolor;
2681 
2682   PetscFunctionBegin;
2683   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2684   /* loop over inodes, marking a color for each column*/
2685   row = 0;
2686   for (i=0; i<m; i++) {
2687     for (j=0; j<ns[i]; j++) {
2688       newcolor[row++] = coloring[i] + j*ncolors;
2689     }
2690   }
2691 
2692   /* eliminate unneeded colors */
2693   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2694   for (i=0; i<n; i++) {
2695     colorused[newcolor[i]] = 1;
2696   }
2697 
2698   for (i=1; i<5*ncolors; i++) {
2699     colorused[i] += colorused[i-1];
2700   }
2701   ncolors = colorused[5*ncolors-1];
2702   for (i=0; i<n; i++) {
2703     newcolor[i] = colorused[newcolor[i]]-1;
2704   }
2705   ierr = PetscFree(colorused);CHKERRQ(ierr);
2706   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2707   ierr = PetscFree(coloring);CHKERRQ(ierr);
2708   PetscFunctionReturn(0);
2709 }
2710 
2711 #include <petsc/private/kernels/blockinvert.h>
2712 
2713 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2714 {
2715   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2716   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2717   MatScalar         *ibdiag,*bdiag,work[25],*t;
2718   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2719   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2720   const PetscScalar *xb, *b;
2721   PetscReal         zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2722   PetscErrorCode    ierr;
2723   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2724   PetscInt          sz,k,ipvt[5];
2725   PetscBool         allowzeropivot,zeropivotdetected;
2726   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2727 
2728   PetscFunctionBegin;
2729   allowzeropivot = PetscNot(A->erroriffailure);
2730   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2731   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2732 
2733   if (!a->inode.ibdiagvalid) {
2734     if (!a->inode.ibdiag) {
2735       /* calculate space needed for diagonal blocks */
2736       for (i=0; i<m; i++) {
2737         cnt += sizes[i]*sizes[i];
2738       }
2739       a->inode.bdiagsize = cnt;
2740 
2741       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2742     }
2743 
2744     /* copy over the diagonal blocks and invert them */
2745     ibdiag = a->inode.ibdiag;
2746     bdiag  = a->inode.bdiag;
2747     cnt    = 0;
2748     for (i=0, row = 0; i<m; i++) {
2749       for (j=0; j<sizes[i]; j++) {
2750         for (k=0; k<sizes[i]; k++) {
2751           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2752         }
2753       }
2754       ierr = PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);CHKERRQ(ierr);
2755 
2756       switch (sizes[i]) {
2757       case 1:
2758         /* Create matrix data structure */
2759         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2760           if (allowzeropivot) {
2761             A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2762             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2763             A->factorerror_zeropivot_row   = row;
2764             ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr);
2765           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2766         }
2767         ibdiag[cnt] = 1.0/ibdiag[cnt];
2768         break;
2769       case 2:
2770         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2771         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2772         break;
2773       case 3:
2774         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2775         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2776         break;
2777       case 4:
2778         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2779         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2780         break;
2781       case 5:
2782         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2783         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2784         break;
2785       default:
2786         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2787       }
2788       cnt += sizes[i]*sizes[i];
2789       row += sizes[i];
2790     }
2791     a->inode.ibdiagvalid = PETSC_TRUE;
2792   }
2793   ibdiag = a->inode.ibdiag;
2794   bdiag  = a->inode.bdiag;
2795   t      = a->inode.ssor_work;
2796 
2797   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2798   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2799   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2800   if (flag & SOR_ZERO_INITIAL_GUESS) {
2801     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2802 
2803       for (i=0, row=0; i<m; i++) {
2804         sz  = diag[row] - ii[row];
2805         v1  = a->a + ii[row];
2806         idx = a->j + ii[row];
2807 
2808         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2809         switch (sizes[i]) {
2810         case 1:
2811 
2812           sum1 = b[row];
2813           for (n = 0; n<sz-1; n+=2) {
2814             i1    = idx[0];
2815             i2    = idx[1];
2816             idx  += 2;
2817             tmp0  = x[i1];
2818             tmp1  = x[i2];
2819             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2820           }
2821 
2822           if (n == sz-1) {
2823             tmp0  = x[*idx];
2824             sum1 -= *v1 * tmp0;
2825           }
2826           t[row]   = sum1;
2827           x[row++] = sum1*(*ibdiag++);
2828           break;
2829         case 2:
2830           v2   = a->a + ii[row+1];
2831           sum1 = b[row];
2832           sum2 = b[row+1];
2833           for (n = 0; n<sz-1; n+=2) {
2834             i1    = idx[0];
2835             i2    = idx[1];
2836             idx  += 2;
2837             tmp0  = x[i1];
2838             tmp1  = x[i2];
2839             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2840             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2841           }
2842 
2843           if (n == sz-1) {
2844             tmp0  = x[*idx];
2845             sum1 -= v1[0] * tmp0;
2846             sum2 -= v2[0] * tmp0;
2847           }
2848           t[row]   = sum1;
2849           t[row+1] = sum2;
2850           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2851           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2852           ibdiag  += 4;
2853           break;
2854         case 3:
2855           v2   = a->a + ii[row+1];
2856           v3   = a->a + ii[row+2];
2857           sum1 = b[row];
2858           sum2 = b[row+1];
2859           sum3 = b[row+2];
2860           for (n = 0; n<sz-1; n+=2) {
2861             i1    = idx[0];
2862             i2    = idx[1];
2863             idx  += 2;
2864             tmp0  = x[i1];
2865             tmp1  = x[i2];
2866             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2867             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2868             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2869           }
2870 
2871           if (n == sz-1) {
2872             tmp0  = x[*idx];
2873             sum1 -= v1[0] * tmp0;
2874             sum2 -= v2[0] * tmp0;
2875             sum3 -= v3[0] * tmp0;
2876           }
2877           t[row]   = sum1;
2878           t[row+1] = sum2;
2879           t[row+2] = sum3;
2880           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2881           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2882           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2883           ibdiag  += 9;
2884           break;
2885         case 4:
2886           v2   = a->a + ii[row+1];
2887           v3   = a->a + ii[row+2];
2888           v4   = a->a + ii[row+3];
2889           sum1 = b[row];
2890           sum2 = b[row+1];
2891           sum3 = b[row+2];
2892           sum4 = b[row+3];
2893           for (n = 0; n<sz-1; n+=2) {
2894             i1    = idx[0];
2895             i2    = idx[1];
2896             idx  += 2;
2897             tmp0  = x[i1];
2898             tmp1  = x[i2];
2899             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2900             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2901             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2902             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2903           }
2904 
2905           if (n == sz-1) {
2906             tmp0  = x[*idx];
2907             sum1 -= v1[0] * tmp0;
2908             sum2 -= v2[0] * tmp0;
2909             sum3 -= v3[0] * tmp0;
2910             sum4 -= v4[0] * tmp0;
2911           }
2912           t[row]   = sum1;
2913           t[row+1] = sum2;
2914           t[row+2] = sum3;
2915           t[row+3] = sum4;
2916           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2917           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2918           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2919           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2920           ibdiag  += 16;
2921           break;
2922         case 5:
2923           v2   = a->a + ii[row+1];
2924           v3   = a->a + ii[row+2];
2925           v4   = a->a + ii[row+3];
2926           v5   = a->a + ii[row+4];
2927           sum1 = b[row];
2928           sum2 = b[row+1];
2929           sum3 = b[row+2];
2930           sum4 = b[row+3];
2931           sum5 = b[row+4];
2932           for (n = 0; n<sz-1; n+=2) {
2933             i1    = idx[0];
2934             i2    = idx[1];
2935             idx  += 2;
2936             tmp0  = x[i1];
2937             tmp1  = x[i2];
2938             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2939             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2940             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2941             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2942             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2943           }
2944 
2945           if (n == sz-1) {
2946             tmp0  = x[*idx];
2947             sum1 -= v1[0] * tmp0;
2948             sum2 -= v2[0] * tmp0;
2949             sum3 -= v3[0] * tmp0;
2950             sum4 -= v4[0] * tmp0;
2951             sum5 -= v5[0] * tmp0;
2952           }
2953           t[row]   = sum1;
2954           t[row+1] = sum2;
2955           t[row+2] = sum3;
2956           t[row+3] = sum4;
2957           t[row+4] = sum5;
2958           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2959           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2960           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2961           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2962           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2963           ibdiag  += 25;
2964           break;
2965         default:
2966           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2967         }
2968       }
2969 
2970       xb   = t;
2971       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2972     } else xb = b;
2973     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2974 
2975       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2976       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2977         ibdiag -= sizes[i]*sizes[i];
2978         sz      = ii[row+1] - diag[row] - 1;
2979         v1      = a->a + diag[row] + 1;
2980         idx     = a->j + diag[row] + 1;
2981 
2982         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2983         switch (sizes[i]) {
2984         case 1:
2985 
2986           sum1 = xb[row];
2987           for (n = 0; n<sz-1; n+=2) {
2988             i1    = idx[0];
2989             i2    = idx[1];
2990             idx  += 2;
2991             tmp0  = x[i1];
2992             tmp1  = x[i2];
2993             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2994           }
2995 
2996           if (n == sz-1) {
2997             tmp0  = x[*idx];
2998             sum1 -= *v1*tmp0;
2999           }
3000           x[row--] = sum1*(*ibdiag);
3001           break;
3002 
3003         case 2:
3004 
3005           sum1 = xb[row];
3006           sum2 = xb[row-1];
3007           /* note that sum1 is associated with the second of the two rows */
3008           v2 = a->a + diag[row-1] + 2;
3009           for (n = 0; n<sz-1; n+=2) {
3010             i1    = idx[0];
3011             i2    = idx[1];
3012             idx  += 2;
3013             tmp0  = x[i1];
3014             tmp1  = x[i2];
3015             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3016             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3017           }
3018 
3019           if (n == sz-1) {
3020             tmp0  = x[*idx];
3021             sum1 -= *v1*tmp0;
3022             sum2 -= *v2*tmp0;
3023           }
3024           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3025           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3026           break;
3027         case 3:
3028 
3029           sum1 = xb[row];
3030           sum2 = xb[row-1];
3031           sum3 = xb[row-2];
3032           v2   = a->a + diag[row-1] + 2;
3033           v3   = a->a + diag[row-2] + 3;
3034           for (n = 0; n<sz-1; n+=2) {
3035             i1    = idx[0];
3036             i2    = idx[1];
3037             idx  += 2;
3038             tmp0  = x[i1];
3039             tmp1  = x[i2];
3040             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3041             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3042             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3043           }
3044 
3045           if (n == sz-1) {
3046             tmp0  = x[*idx];
3047             sum1 -= *v1*tmp0;
3048             sum2 -= *v2*tmp0;
3049             sum3 -= *v3*tmp0;
3050           }
3051           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3052           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3053           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3054           break;
3055         case 4:
3056 
3057           sum1 = xb[row];
3058           sum2 = xb[row-1];
3059           sum3 = xb[row-2];
3060           sum4 = xb[row-3];
3061           v2   = a->a + diag[row-1] + 2;
3062           v3   = a->a + diag[row-2] + 3;
3063           v4   = a->a + diag[row-3] + 4;
3064           for (n = 0; n<sz-1; n+=2) {
3065             i1    = idx[0];
3066             i2    = idx[1];
3067             idx  += 2;
3068             tmp0  = x[i1];
3069             tmp1  = x[i2];
3070             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3071             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3072             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3073             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3074           }
3075 
3076           if (n == sz-1) {
3077             tmp0  = x[*idx];
3078             sum1 -= *v1*tmp0;
3079             sum2 -= *v2*tmp0;
3080             sum3 -= *v3*tmp0;
3081             sum4 -= *v4*tmp0;
3082           }
3083           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3084           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3085           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3086           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3087           break;
3088         case 5:
3089 
3090           sum1 = xb[row];
3091           sum2 = xb[row-1];
3092           sum3 = xb[row-2];
3093           sum4 = xb[row-3];
3094           sum5 = xb[row-4];
3095           v2   = a->a + diag[row-1] + 2;
3096           v3   = a->a + diag[row-2] + 3;
3097           v4   = a->a + diag[row-3] + 4;
3098           v5   = a->a + diag[row-4] + 5;
3099           for (n = 0; n<sz-1; n+=2) {
3100             i1    = idx[0];
3101             i2    = idx[1];
3102             idx  += 2;
3103             tmp0  = x[i1];
3104             tmp1  = x[i2];
3105             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3106             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3107             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3108             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3109             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3110           }
3111 
3112           if (n == sz-1) {
3113             tmp0  = x[*idx];
3114             sum1 -= *v1*tmp0;
3115             sum2 -= *v2*tmp0;
3116             sum3 -= *v3*tmp0;
3117             sum4 -= *v4*tmp0;
3118             sum5 -= *v5*tmp0;
3119           }
3120           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3121           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3122           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3123           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3124           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3125           break;
3126         default:
3127           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3128         }
3129       }
3130 
3131       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3132     }
3133     its--;
3134   }
3135   while (its--) {
3136 
3137     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3138       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3139            i<m;
3140            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3141 
3142         sz  = diag[row] - ii[row];
3143         v1  = a->a + ii[row];
3144         idx = a->j + ii[row];
3145         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3146         switch (sizes[i]) {
3147         case 1:
3148           sum1 = b[row];
3149           for (n = 0; n<sz-1; n+=2) {
3150             i1    = idx[0];
3151             i2    = idx[1];
3152             idx  += 2;
3153             tmp0  = x[i1];
3154             tmp1  = x[i2];
3155             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3156           }
3157           if (n == sz-1) {
3158             tmp0  = x[*idx++];
3159             sum1 -= *v1 * tmp0;
3160             v1++;
3161           }
3162           t[row]   = sum1;
3163           sz      = ii[row+1] - diag[row] - 1;
3164           idx     = a->j + diag[row] + 1;
3165           v1 += 1;
3166           for (n = 0; n<sz-1; n+=2) {
3167             i1    = idx[0];
3168             i2    = idx[1];
3169             idx  += 2;
3170             tmp0  = x[i1];
3171             tmp1  = x[i2];
3172             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3173           }
3174           if (n == sz-1) {
3175             tmp0  = x[*idx++];
3176             sum1 -= *v1 * tmp0;
3177           }
3178           /* in MatSOR_SeqAIJ this line would be
3179            *
3180            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3181            *
3182            * but omega == 1, so this becomes
3183            *
3184            * x[row] = sum1*(*ibdiag++);
3185            *
3186            */
3187           x[row] = sum1*(*ibdiag);
3188           break;
3189         case 2:
3190           v2   = a->a + ii[row+1];
3191           sum1 = b[row];
3192           sum2 = b[row+1];
3193           for (n = 0; n<sz-1; n+=2) {
3194             i1    = idx[0];
3195             i2    = idx[1];
3196             idx  += 2;
3197             tmp0  = x[i1];
3198             tmp1  = x[i2];
3199             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3200             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3201           }
3202           if (n == sz-1) {
3203             tmp0  = x[*idx++];
3204             sum1 -= v1[0] * tmp0;
3205             sum2 -= v2[0] * tmp0;
3206             v1++; v2++;
3207           }
3208           t[row]   = sum1;
3209           t[row+1] = sum2;
3210           sz      = ii[row+1] - diag[row] - 2;
3211           idx     = a->j + diag[row] + 2;
3212           v1 += 2;
3213           v2 += 2;
3214           for (n = 0; n<sz-1; n+=2) {
3215             i1    = idx[0];
3216             i2    = idx[1];
3217             idx  += 2;
3218             tmp0  = x[i1];
3219             tmp1  = x[i2];
3220             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3221             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3222           }
3223           if (n == sz-1) {
3224             tmp0  = x[*idx];
3225             sum1 -= v1[0] * tmp0;
3226             sum2 -= v2[0] * tmp0;
3227           }
3228           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3229           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3230           break;
3231         case 3:
3232           v2   = a->a + ii[row+1];
3233           v3   = a->a + ii[row+2];
3234           sum1 = b[row];
3235           sum2 = b[row+1];
3236           sum3 = b[row+2];
3237           for (n = 0; n<sz-1; n+=2) {
3238             i1    = idx[0];
3239             i2    = idx[1];
3240             idx  += 2;
3241             tmp0  = x[i1];
3242             tmp1  = x[i2];
3243             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3244             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3245             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3246           }
3247           if (n == sz-1) {
3248             tmp0  = x[*idx++];
3249             sum1 -= v1[0] * tmp0;
3250             sum2 -= v2[0] * tmp0;
3251             sum3 -= v3[0] * tmp0;
3252             v1++; v2++; v3++;
3253           }
3254           t[row]   = sum1;
3255           t[row+1] = sum2;
3256           t[row+2] = sum3;
3257           sz      = ii[row+1] - diag[row] - 3;
3258           idx     = a->j + diag[row] + 3;
3259           v1 += 3;
3260           v2 += 3;
3261           v3 += 3;
3262           for (n = 0; n<sz-1; n+=2) {
3263             i1    = idx[0];
3264             i2    = idx[1];
3265             idx  += 2;
3266             tmp0  = x[i1];
3267             tmp1  = x[i2];
3268             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3269             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3270             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3271           }
3272           if (n == sz-1) {
3273             tmp0  = x[*idx];
3274             sum1 -= v1[0] * tmp0;
3275             sum2 -= v2[0] * tmp0;
3276             sum3 -= v3[0] * tmp0;
3277           }
3278           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3279           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3280           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3281           break;
3282         case 4:
3283           v2   = a->a + ii[row+1];
3284           v3   = a->a + ii[row+2];
3285           v4   = a->a + ii[row+3];
3286           sum1 = b[row];
3287           sum2 = b[row+1];
3288           sum3 = b[row+2];
3289           sum4 = b[row+3];
3290           for (n = 0; n<sz-1; n+=2) {
3291             i1    = idx[0];
3292             i2    = idx[1];
3293             idx  += 2;
3294             tmp0  = x[i1];
3295             tmp1  = x[i2];
3296             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3297             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3298             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3299             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3300           }
3301           if (n == sz-1) {
3302             tmp0  = x[*idx++];
3303             sum1 -= v1[0] * tmp0;
3304             sum2 -= v2[0] * tmp0;
3305             sum3 -= v3[0] * tmp0;
3306             sum4 -= v4[0] * tmp0;
3307             v1++; v2++; v3++; v4++;
3308           }
3309           t[row]   = sum1;
3310           t[row+1] = sum2;
3311           t[row+2] = sum3;
3312           t[row+3] = sum4;
3313           sz      = ii[row+1] - diag[row] - 4;
3314           idx     = a->j + diag[row] + 4;
3315           v1 += 4;
3316           v2 += 4;
3317           v3 += 4;
3318           v4 += 4;
3319           for (n = 0; n<sz-1; n+=2) {
3320             i1    = idx[0];
3321             i2    = idx[1];
3322             idx  += 2;
3323             tmp0  = x[i1];
3324             tmp1  = x[i2];
3325             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3326             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3327             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3328             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3329           }
3330           if (n == sz-1) {
3331             tmp0  = x[*idx];
3332             sum1 -= v1[0] * tmp0;
3333             sum2 -= v2[0] * tmp0;
3334             sum3 -= v3[0] * tmp0;
3335             sum4 -= v4[0] * tmp0;
3336           }
3337           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3338           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3339           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3340           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3341           break;
3342         case 5:
3343           v2   = a->a + ii[row+1];
3344           v3   = a->a + ii[row+2];
3345           v4   = a->a + ii[row+3];
3346           v5   = a->a + ii[row+4];
3347           sum1 = b[row];
3348           sum2 = b[row+1];
3349           sum3 = b[row+2];
3350           sum4 = b[row+3];
3351           sum5 = b[row+4];
3352           for (n = 0; n<sz-1; n+=2) {
3353             i1    = idx[0];
3354             i2    = idx[1];
3355             idx  += 2;
3356             tmp0  = x[i1];
3357             tmp1  = x[i2];
3358             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3359             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3360             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3361             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3362             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3363           }
3364           if (n == sz-1) {
3365             tmp0  = x[*idx++];
3366             sum1 -= v1[0] * tmp0;
3367             sum2 -= v2[0] * tmp0;
3368             sum3 -= v3[0] * tmp0;
3369             sum4 -= v4[0] * tmp0;
3370             sum5 -= v5[0] * tmp0;
3371             v1++; v2++; v3++; v4++; v5++;
3372           }
3373           t[row]   = sum1;
3374           t[row+1] = sum2;
3375           t[row+2] = sum3;
3376           t[row+3] = sum4;
3377           t[row+4] = sum5;
3378           sz      = ii[row+1] - diag[row] - 5;
3379           idx     = a->j + diag[row] + 5;
3380           v1 += 5;
3381           v2 += 5;
3382           v3 += 5;
3383           v4 += 5;
3384           v5 += 5;
3385           for (n = 0; n<sz-1; n+=2) {
3386             i1    = idx[0];
3387             i2    = idx[1];
3388             idx  += 2;
3389             tmp0  = x[i1];
3390             tmp1  = x[i2];
3391             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3392             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3393             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3394             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3395             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3396           }
3397           if (n == sz-1) {
3398             tmp0  = x[*idx];
3399             sum1 -= v1[0] * tmp0;
3400             sum2 -= v2[0] * tmp0;
3401             sum3 -= v3[0] * tmp0;
3402             sum4 -= v4[0] * tmp0;
3403             sum5 -= v5[0] * tmp0;
3404           }
3405           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3406           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3407           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3408           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3409           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3410           break;
3411         default:
3412           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3413         }
3414       }
3415       xb   = t;
3416       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3417     } else xb = b;
3418 
3419     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3420 
3421       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3422       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3423         ibdiag -= sizes[i]*sizes[i];
3424 
3425         /* set RHS */
3426         if (xb == b) {
3427           /* whole (old way) */
3428           sz      = ii[row+1] - ii[row];
3429           idx     = a->j + ii[row];
3430           switch (sizes[i]) {
3431           case 5:
3432             v5      = a->a + ii[row-4];
3433           case 4: /* fall through */
3434             v4      = a->a + ii[row-3];
3435           case 3:
3436             v3      = a->a + ii[row-2];
3437           case 2:
3438             v2      = a->a + ii[row-1];
3439           case 1:
3440             v1      = a->a + ii[row];
3441             break;
3442           default:
3443             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3444           }
3445         } else {
3446           /* upper, no diag */
3447           sz      = ii[row+1] - diag[row] - 1;
3448           idx     = a->j + diag[row] + 1;
3449           switch (sizes[i]) {
3450           case 5:
3451             v5      = a->a + diag[row-4] + 5;
3452           case 4: /* fall through */
3453             v4      = a->a + diag[row-3] + 4;
3454           case 3:
3455             v3      = a->a + diag[row-2] + 3;
3456           case 2:
3457             v2      = a->a + diag[row-1] + 2;
3458           case 1:
3459             v1      = a->a + diag[row] + 1;
3460           }
3461         }
3462         /* set sum */
3463         switch (sizes[i]) {
3464         case 5:
3465           sum5 = xb[row-4];
3466         case 4: /* fall through */
3467           sum4 = xb[row-3];
3468         case 3:
3469           sum3 = xb[row-2];
3470         case 2:
3471           sum2 = xb[row-1];
3472         case 1:
3473           /* note that sum1 is associated with the last row */
3474           sum1 = xb[row];
3475         }
3476         /* do sums */
3477         for (n = 0; n<sz-1; n+=2) {
3478             i1    = idx[0];
3479             i2    = idx[1];
3480             idx  += 2;
3481             tmp0  = x[i1];
3482             tmp1  = x[i2];
3483             switch (sizes[i]) {
3484             case 5:
3485               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3486             case 4: /* fall through */
3487               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3488             case 3:
3489               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3490             case 2:
3491               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3492             case 1:
3493               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3494             }
3495         }
3496         /* ragged edge */
3497         if (n == sz-1) {
3498           tmp0  = x[*idx];
3499           switch (sizes[i]) {
3500           case 5:
3501             sum5 -= *v5*tmp0;
3502           case 4: /* fall through */
3503             sum4 -= *v4*tmp0;
3504           case 3:
3505             sum3 -= *v3*tmp0;
3506           case 2:
3507             sum2 -= *v2*tmp0;
3508           case 1:
3509             sum1 -= *v1*tmp0;
3510           }
3511         }
3512         /* update */
3513         if (xb == b) {
3514           /* whole (old way) w/ diag */
3515           switch (sizes[i]) {
3516           case 5:
3517             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3518             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3519             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3520             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3521             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3522             break;
3523           case 4:
3524             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3525             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3526             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3527             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3528             break;
3529           case 3:
3530             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3531             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3532             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3533             break;
3534           case 2:
3535             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3536             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3537             break;
3538           case 1:
3539             x[row--] += sum1*(*ibdiag);
3540             break;
3541           }
3542         } else {
3543           /* no diag so set =  */
3544           switch (sizes[i]) {
3545           case 5:
3546             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3547             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3548             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3549             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3550             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3551             break;
3552           case 4:
3553             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3554             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3555             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3556             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3557             break;
3558           case 3:
3559             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3560             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3561             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3562             break;
3563           case 2:
3564             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3565             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3566             break;
3567           case 1:
3568             x[row--] = sum1*(*ibdiag);
3569             break;
3570           }
3571         }
3572       }
3573       if (xb == b) {
3574         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3575       } else {
3576         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3577       }
3578     }
3579   }
3580   if (flag & SOR_EISENSTAT) {
3581     /*
3582           Apply  (U + D)^-1  where D is now the block diagonal
3583     */
3584     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3585     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3586       ibdiag -= sizes[i]*sizes[i];
3587       sz      = ii[row+1] - diag[row] - 1;
3588       v1      = a->a + diag[row] + 1;
3589       idx     = a->j + diag[row] + 1;
3590       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3591       switch (sizes[i]) {
3592       case 1:
3593 
3594         sum1 = b[row];
3595         for (n = 0; n<sz-1; n+=2) {
3596           i1    = idx[0];
3597           i2    = idx[1];
3598           idx  += 2;
3599           tmp0  = x[i1];
3600           tmp1  = x[i2];
3601           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3602         }
3603 
3604         if (n == sz-1) {
3605           tmp0  = x[*idx];
3606           sum1 -= *v1*tmp0;
3607         }
3608         x[row] = sum1*(*ibdiag);row--;
3609         break;
3610 
3611       case 2:
3612 
3613         sum1 = b[row];
3614         sum2 = b[row-1];
3615         /* note that sum1 is associated with the second of the two rows */
3616         v2 = a->a + diag[row-1] + 2;
3617         for (n = 0; n<sz-1; n+=2) {
3618           i1    = idx[0];
3619           i2    = idx[1];
3620           idx  += 2;
3621           tmp0  = x[i1];
3622           tmp1  = x[i2];
3623           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3624           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3625         }
3626 
3627         if (n == sz-1) {
3628           tmp0  = x[*idx];
3629           sum1 -= *v1*tmp0;
3630           sum2 -= *v2*tmp0;
3631         }
3632         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3633         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3634         row     -= 2;
3635         break;
3636       case 3:
3637 
3638         sum1 = b[row];
3639         sum2 = b[row-1];
3640         sum3 = b[row-2];
3641         v2   = a->a + diag[row-1] + 2;
3642         v3   = a->a + diag[row-2] + 3;
3643         for (n = 0; n<sz-1; n+=2) {
3644           i1    = idx[0];
3645           i2    = idx[1];
3646           idx  += 2;
3647           tmp0  = x[i1];
3648           tmp1  = x[i2];
3649           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3650           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3651           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3652         }
3653 
3654         if (n == sz-1) {
3655           tmp0  = x[*idx];
3656           sum1 -= *v1*tmp0;
3657           sum2 -= *v2*tmp0;
3658           sum3 -= *v3*tmp0;
3659         }
3660         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3661         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3662         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3663         row     -= 3;
3664         break;
3665       case 4:
3666 
3667         sum1 = b[row];
3668         sum2 = b[row-1];
3669         sum3 = b[row-2];
3670         sum4 = b[row-3];
3671         v2   = a->a + diag[row-1] + 2;
3672         v3   = a->a + diag[row-2] + 3;
3673         v4   = a->a + diag[row-3] + 4;
3674         for (n = 0; n<sz-1; n+=2) {
3675           i1    = idx[0];
3676           i2    = idx[1];
3677           idx  += 2;
3678           tmp0  = x[i1];
3679           tmp1  = x[i2];
3680           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3681           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3682           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3683           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3684         }
3685 
3686         if (n == sz-1) {
3687           tmp0  = x[*idx];
3688           sum1 -= *v1*tmp0;
3689           sum2 -= *v2*tmp0;
3690           sum3 -= *v3*tmp0;
3691           sum4 -= *v4*tmp0;
3692         }
3693         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3694         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3695         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3696         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3697         row     -= 4;
3698         break;
3699       case 5:
3700 
3701         sum1 = b[row];
3702         sum2 = b[row-1];
3703         sum3 = b[row-2];
3704         sum4 = b[row-3];
3705         sum5 = b[row-4];
3706         v2   = a->a + diag[row-1] + 2;
3707         v3   = a->a + diag[row-2] + 3;
3708         v4   = a->a + diag[row-3] + 4;
3709         v5   = a->a + diag[row-4] + 5;
3710         for (n = 0; n<sz-1; n+=2) {
3711           i1    = idx[0];
3712           i2    = idx[1];
3713           idx  += 2;
3714           tmp0  = x[i1];
3715           tmp1  = x[i2];
3716           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3717           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3718           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3719           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3720           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3721         }
3722 
3723         if (n == sz-1) {
3724           tmp0  = x[*idx];
3725           sum1 -= *v1*tmp0;
3726           sum2 -= *v2*tmp0;
3727           sum3 -= *v3*tmp0;
3728           sum4 -= *v4*tmp0;
3729           sum5 -= *v5*tmp0;
3730         }
3731         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3732         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3733         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3734         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3735         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3736         row     -= 5;
3737         break;
3738       default:
3739         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3740       }
3741     }
3742     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3743 
3744     /*
3745            t = b - D x    where D is the block diagonal
3746     */
3747     cnt = 0;
3748     for (i=0, row=0; i<m; i++) {
3749       switch (sizes[i]) {
3750       case 1:
3751         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3752         break;
3753       case 2:
3754         x1       = x[row]; x2 = x[row+1];
3755         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3756         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3757         t[row]   = b[row] - tmp1;
3758         t[row+1] = b[row+1] - tmp2; row += 2;
3759         cnt     += 4;
3760         break;
3761       case 3:
3762         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3763         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3764         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3765         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3766         t[row]   = b[row] - tmp1;
3767         t[row+1] = b[row+1] - tmp2;
3768         t[row+2] = b[row+2] - tmp3; row += 3;
3769         cnt     += 9;
3770         break;
3771       case 4:
3772         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3773         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3774         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3775         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3776         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3777         t[row]   = b[row] - tmp1;
3778         t[row+1] = b[row+1] - tmp2;
3779         t[row+2] = b[row+2] - tmp3;
3780         t[row+3] = b[row+3] - tmp4; row += 4;
3781         cnt     += 16;
3782         break;
3783       case 5:
3784         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3785         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3786         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3787         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3788         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3789         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3790         t[row]   = b[row] - tmp1;
3791         t[row+1] = b[row+1] - tmp2;
3792         t[row+2] = b[row+2] - tmp3;
3793         t[row+3] = b[row+3] - tmp4;
3794         t[row+4] = b[row+4] - tmp5;row += 5;
3795         cnt     += 25;
3796         break;
3797       default:
3798         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3799       }
3800     }
3801     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3802 
3803 
3804 
3805     /*
3806           Apply (L + D)^-1 where D is the block diagonal
3807     */
3808     for (i=0, row=0; i<m; i++) {
3809       sz  = diag[row] - ii[row];
3810       v1  = a->a + ii[row];
3811       idx = a->j + ii[row];
3812       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3813       switch (sizes[i]) {
3814       case 1:
3815 
3816         sum1 = t[row];
3817         for (n = 0; n<sz-1; n+=2) {
3818           i1    = idx[0];
3819           i2    = idx[1];
3820           idx  += 2;
3821           tmp0  = t[i1];
3822           tmp1  = t[i2];
3823           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3824         }
3825 
3826         if (n == sz-1) {
3827           tmp0  = t[*idx];
3828           sum1 -= *v1 * tmp0;
3829         }
3830         x[row] += t[row] = sum1*(*ibdiag++); row++;
3831         break;
3832       case 2:
3833         v2   = a->a + ii[row+1];
3834         sum1 = t[row];
3835         sum2 = t[row+1];
3836         for (n = 0; n<sz-1; n+=2) {
3837           i1    = idx[0];
3838           i2    = idx[1];
3839           idx  += 2;
3840           tmp0  = t[i1];
3841           tmp1  = t[i2];
3842           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3843           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3844         }
3845 
3846         if (n == sz-1) {
3847           tmp0  = t[*idx];
3848           sum1 -= v1[0] * tmp0;
3849           sum2 -= v2[0] * tmp0;
3850         }
3851         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3852         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3853         ibdiag   += 4; row += 2;
3854         break;
3855       case 3:
3856         v2   = a->a + ii[row+1];
3857         v3   = a->a + ii[row+2];
3858         sum1 = t[row];
3859         sum2 = t[row+1];
3860         sum3 = t[row+2];
3861         for (n = 0; n<sz-1; n+=2) {
3862           i1    = idx[0];
3863           i2    = idx[1];
3864           idx  += 2;
3865           tmp0  = t[i1];
3866           tmp1  = t[i2];
3867           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3868           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3869           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3870         }
3871 
3872         if (n == sz-1) {
3873           tmp0  = t[*idx];
3874           sum1 -= v1[0] * tmp0;
3875           sum2 -= v2[0] * tmp0;
3876           sum3 -= v3[0] * tmp0;
3877         }
3878         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3879         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3880         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3881         ibdiag   += 9; row += 3;
3882         break;
3883       case 4:
3884         v2   = a->a + ii[row+1];
3885         v3   = a->a + ii[row+2];
3886         v4   = a->a + ii[row+3];
3887         sum1 = t[row];
3888         sum2 = t[row+1];
3889         sum3 = t[row+2];
3890         sum4 = t[row+3];
3891         for (n = 0; n<sz-1; n+=2) {
3892           i1    = idx[0];
3893           i2    = idx[1];
3894           idx  += 2;
3895           tmp0  = t[i1];
3896           tmp1  = t[i2];
3897           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3898           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3899           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3900           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3901         }
3902 
3903         if (n == sz-1) {
3904           tmp0  = t[*idx];
3905           sum1 -= v1[0] * tmp0;
3906           sum2 -= v2[0] * tmp0;
3907           sum3 -= v3[0] * tmp0;
3908           sum4 -= v4[0] * tmp0;
3909         }
3910         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3911         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3912         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3913         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3914         ibdiag   += 16; row += 4;
3915         break;
3916       case 5:
3917         v2   = a->a + ii[row+1];
3918         v3   = a->a + ii[row+2];
3919         v4   = a->a + ii[row+3];
3920         v5   = a->a + ii[row+4];
3921         sum1 = t[row];
3922         sum2 = t[row+1];
3923         sum3 = t[row+2];
3924         sum4 = t[row+3];
3925         sum5 = t[row+4];
3926         for (n = 0; n<sz-1; n+=2) {
3927           i1    = idx[0];
3928           i2    = idx[1];
3929           idx  += 2;
3930           tmp0  = t[i1];
3931           tmp1  = t[i2];
3932           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3933           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3934           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3935           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3936           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3937         }
3938 
3939         if (n == sz-1) {
3940           tmp0  = t[*idx];
3941           sum1 -= v1[0] * tmp0;
3942           sum2 -= v2[0] * tmp0;
3943           sum3 -= v3[0] * tmp0;
3944           sum4 -= v4[0] * tmp0;
3945           sum5 -= v5[0] * tmp0;
3946         }
3947         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3948         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3949         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3950         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3951         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3952         ibdiag   += 25; row += 5;
3953         break;
3954       default:
3955         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3956       }
3957     }
3958     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3959   }
3960   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3961   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3962   PetscFunctionReturn(0);
3963 }
3964 
3965 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3966 {
3967   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3968   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3969   const MatScalar   *bdiag = a->inode.bdiag;
3970   const PetscScalar *b;
3971   PetscErrorCode    ierr;
3972   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3973   const PetscInt    *sizes = a->inode.size;
3974 
3975   PetscFunctionBegin;
3976   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3977   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3978   cnt  = 0;
3979   for (i=0, row=0; i<m; i++) {
3980     switch (sizes[i]) {
3981     case 1:
3982       x[row] = b[row]*bdiag[cnt++];row++;
3983       break;
3984     case 2:
3985       x1       = b[row]; x2 = b[row+1];
3986       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3987       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3988       x[row++] = tmp1;
3989       x[row++] = tmp2;
3990       cnt     += 4;
3991       break;
3992     case 3:
3993       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3994       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3995       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3996       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3997       x[row++] = tmp1;
3998       x[row++] = tmp2;
3999       x[row++] = tmp3;
4000       cnt     += 9;
4001       break;
4002     case 4:
4003       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4004       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4005       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4006       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4007       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4008       x[row++] = tmp1;
4009       x[row++] = tmp2;
4010       x[row++] = tmp3;
4011       x[row++] = tmp4;
4012       cnt     += 16;
4013       break;
4014     case 5:
4015       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4016       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4017       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4018       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4019       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4020       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4021       x[row++] = tmp1;
4022       x[row++] = tmp2;
4023       x[row++] = tmp3;
4024       x[row++] = tmp4;
4025       x[row++] = tmp5;
4026       cnt     += 25;
4027       break;
4028     default:
4029       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4030     }
4031   }
4032   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4033   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4034   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4035   PetscFunctionReturn(0);
4036 }
4037 
4038 /*
4039     samestructure indicates that the matrix has not changed its nonzero structure so we
4040     do not need to recompute the inodes
4041 */
4042 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4043 {
4044   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4045   PetscErrorCode ierr;
4046   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4047   PetscBool      flag;
4048   const PetscInt *idx,*idy,*ii;
4049 
4050   PetscFunctionBegin;
4051   if (!a->inode.use) PetscFunctionReturn(0);
4052   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4053 
4054   m = A->rmap->n;
4055   if (a->inode.size) ns = a->inode.size;
4056   else {
4057     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4058   }
4059 
4060   i          = 0;
4061   node_count = 0;
4062   idx        = a->j;
4063   ii         = a->i;
4064   while (i < m) {                /* For each row */
4065     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4066     /* Limits the number of elements in a node to 'a->inode.limit' */
4067     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4068       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4069       if (nzy != nzx) break;
4070       idy += nzx;              /* Same nonzero pattern */
4071       ierr = PetscArraycmp(idx,idy,nzx,&flag);CHKERRQ(ierr);
4072       if (!flag) break;
4073     }
4074     ns[node_count++] = blk_size;
4075     idx             += blk_size*nzx;
4076     i                = j;
4077   }
4078   /* If not enough inodes found,, do not use inode version of the routines */
4079   if (!m || node_count > .8*m) {
4080     ierr = PetscFree(ns);CHKERRQ(ierr);
4081 
4082     a->inode.node_count       = 0;
4083     a->inode.size             = NULL;
4084     a->inode.use              = PETSC_FALSE;
4085     A->ops->mult              = MatMult_SeqAIJ;
4086     A->ops->sor               = MatSOR_SeqAIJ;
4087     A->ops->multadd           = MatMultAdd_SeqAIJ;
4088     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4089     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4090     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4091     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4092     A->ops->coloringpatch     = 0;
4093     A->ops->multdiagonalblock = 0;
4094 
4095     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4096   } else {
4097     if (!A->factortype) {
4098       A->ops->mult              = MatMult_SeqAIJ_Inode;
4099       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4100       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4101       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4102       if (A->rmap->n == A->cmap->n) {
4103         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4104         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4105         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4106         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4107         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4108       }
4109     } else {
4110       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4111     }
4112     a->inode.node_count = node_count;
4113     a->inode.size       = ns;
4114     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4115   }
4116   a->inode.checked          = PETSC_TRUE;
4117   a->inode.mat_nonzerostate = A->nonzerostate;
4118   PetscFunctionReturn(0);
4119 }
4120 
4121 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4122 {
4123   Mat            B =*C;
4124   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4125   PetscErrorCode ierr;
4126   PetscInt       m=A->rmap->n;
4127 
4128   PetscFunctionBegin;
4129   c->inode.use       = a->inode.use;
4130   c->inode.limit     = a->inode.limit;
4131   c->inode.max_limit = a->inode.max_limit;
4132   if (a->inode.size) {
4133     ierr                = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4134     c->inode.node_count = a->inode.node_count;
4135     ierr                = PetscArraycpy(c->inode.size,a->inode.size,m+1);CHKERRQ(ierr);
4136     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4137     if (!B->factortype) {
4138       B->ops->mult              = MatMult_SeqAIJ_Inode;
4139       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4140       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4141       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4142       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4143       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4144       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4145       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4146       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4147     } else {
4148       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4149     }
4150   } else {
4151     c->inode.size       = 0;
4152     c->inode.node_count = 0;
4153   }
4154   c->inode.ibdiagvalid = PETSC_FALSE;
4155   c->inode.ibdiag      = 0;
4156   c->inode.bdiag       = 0;
4157   PetscFunctionReturn(0);
4158 }
4159 
4160 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4161 {
4162   PetscInt       k;
4163   const PetscInt *vi;
4164 
4165   PetscFunctionBegin;
4166   vi = aj + ai[row];
4167   for (k=0; k<nzl; k++) cols[k] = vi[k];
4168   vi        = aj + adiag[row];
4169   cols[nzl] = vi[0];
4170   vi        = aj + adiag[row+1]+1;
4171   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4172   PetscFunctionReturn(0);
4173 }
4174 /*
4175    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4176    Modified from MatSeqAIJCheckInode().
4177 
4178    Input Parameters:
4179 .  Mat A - ILU or LU matrix factor
4180 
4181 */
4182 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4183 {
4184   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4185   PetscErrorCode ierr;
4186   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4187   PetscInt       *cols1,*cols2,*ns;
4188   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4189   PetscBool      flag;
4190 
4191   PetscFunctionBegin;
4192   if (!a->inode.use)    PetscFunctionReturn(0);
4193   if (a->inode.checked) PetscFunctionReturn(0);
4194 
4195   m = A->rmap->n;
4196   if (a->inode.size) ns = a->inode.size;
4197   else {
4198     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4199   }
4200 
4201   i          = 0;
4202   node_count = 0;
4203   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4204   while (i < m) {                /* For each row */
4205     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4206     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4207     nzx  = nzl1 + nzu1 + 1;
4208     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4209 
4210     /* Limits the number of elements in a node to 'a->inode.limit' */
4211     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4212       nzl2 = ai[j+1] - ai[j];
4213       nzu2 = adiag[j] - adiag[j+1] - 1;
4214       nzy  = nzl2 + nzu2 + 1;
4215       if (nzy != nzx) break;
4216       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4217       ierr = PetscArraycmp(cols1,cols2,nzx,&flag);CHKERRQ(ierr);
4218       if (!flag) break;
4219     }
4220     ns[node_count++] = blk_size;
4221     i                = j;
4222   }
4223   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4224   /* If not enough inodes found,, do not use inode version of the routines */
4225   if (!m || node_count > .8*m) {
4226     ierr = PetscFree(ns);CHKERRQ(ierr);
4227 
4228     a->inode.node_count = 0;
4229     a->inode.size       = NULL;
4230     a->inode.use        = PETSC_FALSE;
4231 
4232     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4233   } else {
4234     A->ops->mult              = 0;
4235     A->ops->sor               = 0;
4236     A->ops->multadd           = 0;
4237     A->ops->getrowij          = 0;
4238     A->ops->restorerowij      = 0;
4239     A->ops->getcolumnij       = 0;
4240     A->ops->restorecolumnij   = 0;
4241     A->ops->coloringpatch     = 0;
4242     A->ops->multdiagonalblock = 0;
4243     a->inode.node_count       = node_count;
4244     a->inode.size             = ns;
4245 
4246     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4247   }
4248   a->inode.checked = PETSC_TRUE;
4249   PetscFunctionReturn(0);
4250 }
4251 
4252 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4253 {
4254   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4255 
4256   PetscFunctionBegin;
4257   a->inode.ibdiagvalid = PETSC_FALSE;
4258   PetscFunctionReturn(0);
4259 }
4260 
4261 /*
4262      This is really ugly. if inodes are used this replaces the
4263   permutations with ones that correspond to rows/cols of the matrix
4264   rather then inode blocks
4265 */
4266 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4267 {
4268   PetscErrorCode ierr;
4269 
4270   PetscFunctionBegin;
4271   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4272   PetscFunctionReturn(0);
4273 }
4274 
4275 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4276 {
4277   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4278   PetscErrorCode ierr;
4279   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4280   const PetscInt *ridx,*cidx;
4281   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4282   PetscInt       nslim_col,*ns_col;
4283   IS             ris = *rperm,cis = *cperm;
4284 
4285   PetscFunctionBegin;
4286   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4287   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4288 
4289   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4290   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4291   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4292 
4293   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4294   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4295 
4296   /* Form the inode structure for the rows of permuted matric using inv perm*/
4297   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4298 
4299   /* Construct the permutations for rows*/
4300   for (i=0,row = 0; i<nslim_row; ++i) {
4301     indx      = ridx[i];
4302     start_val = tns[indx];
4303     end_val   = tns[indx + 1];
4304     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4305   }
4306 
4307   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4308   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4309 
4310   /* Construct permutations for columns */
4311   for (i=0,col=0; i<nslim_col; ++i) {
4312     indx      = cidx[i];
4313     start_val = tns[indx];
4314     end_val   = tns[indx + 1];
4315     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4316   }
4317 
4318   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4319   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4320   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4321   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4322 
4323   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4324   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4325 
4326   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4327   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4328   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4329   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4330   ierr = PetscFree(tns);CHKERRQ(ierr);
4331   PetscFunctionReturn(0);
4332 }
4333 
4334 /*@C
4335    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4336 
4337    Not Collective
4338 
4339    Input Parameter:
4340 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4341 
4342    Output Parameter:
4343 +  node_count - no of inodes present in the matrix.
4344 .  sizes      - an array of size node_count,with sizes of each inode.
4345 -  limit      - the max size used to generate the inodes.
4346 
4347    Level: advanced
4348 
4349    Notes:
4350     This routine returns some internal storage information
4351    of the matrix, it is intended to be used by advanced users.
4352    It should be called after the matrix is assembled.
4353    The contents of the sizes[] array should not be changed.
4354    NULL may be passed for information not requested.
4355 
4356 .seealso: MatGetInfo()
4357 @*/
4358 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4359 {
4360   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4361 
4362   PetscFunctionBegin;
4363   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4364   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4365   if (f) {
4366     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4367   }
4368   PetscFunctionReturn(0);
4369 }
4370 
4371 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4372 {
4373   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4374 
4375   PetscFunctionBegin;
4376   if (node_count) *node_count = a->inode.node_count;
4377   if (sizes)      *sizes      = a->inode.size;
4378   if (limit)      *limit      = a->inode.limit;
4379   PetscFunctionReturn(0);
4380 }
4381