xref: /petsc/src/mat/impls/aij/seq/inode.c (revision abb87a526e8b6589f379e0025c2c875fd04c377e)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1) {
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     col  = *j++ + ishift;
101     i2   = tvc[col];
102     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103       ia[i1+1]++;
104       ia[i2+1]++;
105       i2++;                     /* Start col of next node */
106       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107       i2 = tvc[col];
108     }
109     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
110   }
111 
112   /* shift ia[i] to point to next row */
113   for (i1=1; i1<nslim_row+1; i1++) {
114     row        = ia[i1-1];
115     ia[i1]    += row;
116     work[i1-1] = row - oshift;
117   }
118 
119   /* allocate space for column pointers */
120   nz   = ia[nslim_row] + (!ishift);
121   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
122   *jja = ja;
123 
124   /* loop over lower triangular part putting into ja */
125   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126     j    = aj + ai[row] + ishift;
127     jmax = aj + ai[row+1] + ishift;
128     col  = *j++ + ishift;
129     i2   = tvc[col];
130     while (i2<i1 && j<jmax) {
131       ja[work[i2]++] = i1 + oshift;
132       ja[work[i1]++] = i2 + oshift;
133       ++i2;
134       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135       i2 = tvc[col];
136     }
137     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
138 
139   }
140   ierr = PetscFree(work);CHKERRQ(ierr);
141   ierr = PetscFree(tns);CHKERRQ(ierr);
142   ierr = PetscFree(tvc);CHKERRQ(ierr);
143   PetscFunctionReturn(0);
144 }
145 
146 /*
147       This builds nonsymmetric version of nonzero structure,
148 */
149 #undef __FUNCT__
150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152 {
153   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
154   PetscErrorCode ierr;
155   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156   PetscInt       *tns,*tvc,nsz,i1,i2;
157   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
158 
159   PetscFunctionBegin;
160   nslim_row = a->inode.node_count;
161   n         = A->cmap->n;
162 
163   /* Create The column_inode for this matrix */
164   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
165 
166   /* allocate space for reformated column_inode structure */
167   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
168   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
169   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
170 
171   for (i1=0,col=0; i1<nslim_col; ++i1) {
172     nsz = ns_col[i1];
173     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174   }
175   /* allocate space for row pointers */
176   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
177   *iia = ia;
178   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
180 
181   /* determine the number of columns in each row */
182   ia[0] = oshift;
183   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184     j   = aj + ai[row] + ishift;
185     col = *j++ + ishift;
186     i2  = tvc[col];
187     nz  = ai[row+1] - ai[row];
188     while (nz-- > 0) {           /* off-diagonal elemets */
189       ia[i1+1]++;
190       i2++;                     /* Start col of next node */
191       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192       if (nz > 0) i2 = tvc[col];
193     }
194   }
195 
196   /* shift ia[i] to point to next row */
197   for (i1=1; i1<nslim_row+1; i1++) {
198     row        = ia[i1-1];
199     ia[i1]    += row;
200     work[i1-1] = row - oshift;
201   }
202 
203   /* allocate space for column pointers */
204   nz   = ia[nslim_row] + (!ishift);
205   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
206   *jja = ja;
207 
208   /* loop over matrix putting into ja */
209   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210     j   = aj + ai[row] + ishift;
211     col = *j++ + ishift;
212     i2  = tvc[col];
213     nz  = ai[row+1] - ai[row];
214     while (nz-- > 0) {
215       ja[work[i1]++] = i2 + oshift;
216       ++i2;
217       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218       if (nz > 0) i2 = tvc[col];
219     }
220   }
221   ierr = PetscFree(ns_col);CHKERRQ(ierr);
222   ierr = PetscFree(work);CHKERRQ(ierr);
223   ierr = PetscFree(tns);CHKERRQ(ierr);
224   ierr = PetscFree(tvc);CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
231 {
232   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
233   PetscErrorCode ierr;
234 
235   PetscFunctionBegin;
236   *n = a->inode.node_count;
237   if (!ia) PetscFunctionReturn(0);
238   if (!blockcompressed) {
239     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
240   } else if (symmetric) {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   } else {
243     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #undef __FUNCT__
249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
251 {
252   PetscErrorCode ierr;
253 
254   PetscFunctionBegin;
255   if (!ia) PetscFunctionReturn(0);
256 
257   if (!blockcompressed) {
258     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
259   } else {
260     ierr = PetscFree(*ia);CHKERRQ(ierr);
261     ierr = PetscFree(*ja);CHKERRQ(ierr);
262   }
263   PetscFunctionReturn(0);
264 }
265 
266 /* ----------------------------------------------------------- */
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271 {
272   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
273   PetscErrorCode ierr;
274   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276 
277   PetscFunctionBegin;
278   nslim_row = a->inode.node_count;
279   n         = A->cmap->n;
280 
281   /* Create The column_inode for this matrix */
282   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
283 
284   /* allocate space for reformated column_inode structure */
285   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
286   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
287   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
288 
289   for (i1=0,col=0; i1<nslim_col; ++i1) {
290     nsz = ns_col[i1];
291     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292   }
293   /* allocate space for column pointers */
294   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
295   *iia = ia;
296   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
297   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
298 
299   /* determine the number of columns in each row */
300   ia[0] = oshift;
301   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302     j   = aj + ai[row] + ishift;
303     col = *j++ + ishift;
304     i2  = tvc[col];
305     nz  = ai[row+1] - ai[row];
306     while (nz-- > 0) {           /* off-diagonal elemets */
307       /* ia[i1+1]++; */
308       ia[i2+1]++;
309       i2++;
310       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311       if (nz > 0) i2 = tvc[col];
312     }
313   }
314 
315   /* shift ia[i] to point to next col */
316   for (i1=1; i1<nslim_col+1; i1++) {
317     col        = ia[i1-1];
318     ia[i1]    += col;
319     work[i1-1] = col - oshift;
320   }
321 
322   /* allocate space for column pointers */
323   nz   = ia[nslim_col] + (!ishift);
324   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
325   *jja = ja;
326 
327   /* loop over matrix putting into ja */
328   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329     j   = aj + ai[row] + ishift;
330     col = *j++ + ishift;
331     i2  = tvc[col];
332     nz  = ai[row+1] - ai[row];
333     while (nz-- > 0) {
334       /* ja[work[i1]++] = i2 + oshift; */
335       ja[work[i2]++] = i1 + oshift;
336       i2++;
337       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338       if (nz > 0) i2 = tvc[col];
339     }
340   }
341   ierr = PetscFree(ns_col);CHKERRQ(ierr);
342   ierr = PetscFree(work);CHKERRQ(ierr);
343   ierr = PetscFree(tns);CHKERRQ(ierr);
344   ierr = PetscFree(tvc);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351 {
352   PetscErrorCode ierr;
353 
354   PetscFunctionBegin;
355   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
356   if (!ia) PetscFunctionReturn(0);
357 
358   if (!blockcompressed) {
359     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
360   } else if (symmetric) {
361     /* Since the indices are symmetric it does'nt matter */
362     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363   } else {
364     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
365   }
366   PetscFunctionReturn(0);
367 }
368 
369 #undef __FUNCT__
370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
372 {
373   PetscErrorCode ierr;
374 
375   PetscFunctionBegin;
376   if (!ia) PetscFunctionReturn(0);
377   if (!blockcompressed) {
378     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
379   } else {
380     ierr = PetscFree(*ia);CHKERRQ(ierr);
381     ierr = PetscFree(*ja);CHKERRQ(ierr);
382   }
383   PetscFunctionReturn(0);
384 }
385 
386 /* ----------------------------------------------------------- */
387 
388 #undef __FUNCT__
389 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391 {
392   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
393   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394   PetscScalar       *y;
395   const PetscScalar *x;
396   const MatScalar   *v1,*v2,*v3,*v4,*v5;
397   PetscErrorCode    ierr;
398   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399   const PetscInt    *idx,*ns,*ii;
400 
401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403 #endif
404 
405   PetscFunctionBegin;
406   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407   node_max = a->inode.node_count;
408   ns       = a->inode.size;     /* Node Size array */
409   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
411   idx      = a->j;
412   v1       = a->a;
413   ii       = a->i;
414 
415   for (i = 0,row = 0; i< node_max; ++i) {
416     nsz         = ns[i];
417     n           = ii[1] - ii[0];
418     nonzerorow += (n>0)*nsz;
419     ii         += nsz;
420     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
421     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
422     sz = n;                     /* No of non zeros in this row */
423                                 /* Switch on the size of Node */
424     switch (nsz) {               /* Each loop in 'case' is unrolled */
425     case 1:
426       sum1 = 0.;
427 
428       for (n = 0; n< sz-1; n+=2) {
429         i1    = idx[0];         /* The instructions are ordered to */
430         i2    = idx[1];         /* make the compiler's job easy */
431         idx  += 2;
432         tmp0  = x[i1];
433         tmp1  = x[i2];
434         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435       }
436 
437       if (n == sz-1) {          /* Take care of the last nonzero  */
438         tmp0  = x[*idx++];
439         sum1 += *v1++ *tmp0;
440       }
441       y[row++]=sum1;
442       break;
443     case 2:
444       sum1 = 0.;
445       sum2 = 0.;
446       v2   = v1 + n;
447 
448       for (n = 0; n< sz-1; n+=2) {
449         i1    = idx[0];
450         i2    = idx[1];
451         idx  += 2;
452         tmp0  = x[i1];
453         tmp1  = x[i2];
454         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456       }
457       if (n == sz-1) {
458         tmp0  = x[*idx++];
459         sum1 += *v1++ * tmp0;
460         sum2 += *v2++ * tmp0;
461       }
462       y[row++]=sum1;
463       y[row++]=sum2;
464       v1      =v2;              /* Since the next block to be processed starts there*/
465       idx    +=sz;
466       break;
467     case 3:
468       sum1 = 0.;
469       sum2 = 0.;
470       sum3 = 0.;
471       v2   = v1 + n;
472       v3   = v2 + n;
473 
474       for (n = 0; n< sz-1; n+=2) {
475         i1    = idx[0];
476         i2    = idx[1];
477         idx  += 2;
478         tmp0  = x[i1];
479         tmp1  = x[i2];
480         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483       }
484       if (n == sz-1) {
485         tmp0  = x[*idx++];
486         sum1 += *v1++ * tmp0;
487         sum2 += *v2++ * tmp0;
488         sum3 += *v3++ * tmp0;
489       }
490       y[row++]=sum1;
491       y[row++]=sum2;
492       y[row++]=sum3;
493       v1      =v3;              /* Since the next block to be processed starts there*/
494       idx    +=2*sz;
495       break;
496     case 4:
497       sum1 = 0.;
498       sum2 = 0.;
499       sum3 = 0.;
500       sum4 = 0.;
501       v2   = v1 + n;
502       v3   = v2 + n;
503       v4   = v3 + n;
504 
505       for (n = 0; n< sz-1; n+=2) {
506         i1    = idx[0];
507         i2    = idx[1];
508         idx  += 2;
509         tmp0  = x[i1];
510         tmp1  = x[i2];
511         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515       }
516       if (n == sz-1) {
517         tmp0  = x[*idx++];
518         sum1 += *v1++ * tmp0;
519         sum2 += *v2++ * tmp0;
520         sum3 += *v3++ * tmp0;
521         sum4 += *v4++ * tmp0;
522       }
523       y[row++]=sum1;
524       y[row++]=sum2;
525       y[row++]=sum3;
526       y[row++]=sum4;
527       v1      =v4;              /* Since the next block to be processed starts there*/
528       idx    +=3*sz;
529       break;
530     case 5:
531       sum1 = 0.;
532       sum2 = 0.;
533       sum3 = 0.;
534       sum4 = 0.;
535       sum5 = 0.;
536       v2   = v1 + n;
537       v3   = v2 + n;
538       v4   = v3 + n;
539       v5   = v4 + n;
540 
541       for (n = 0; n<sz-1; n+=2) {
542         i1    = idx[0];
543         i2    = idx[1];
544         idx  += 2;
545         tmp0  = x[i1];
546         tmp1  = x[i2];
547         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552       }
553       if (n == sz-1) {
554         tmp0  = x[*idx++];
555         sum1 += *v1++ * tmp0;
556         sum2 += *v2++ * tmp0;
557         sum3 += *v3++ * tmp0;
558         sum4 += *v4++ * tmp0;
559         sum5 += *v5++ * tmp0;
560       }
561       y[row++]=sum1;
562       y[row++]=sum2;
563       y[row++]=sum3;
564       y[row++]=sum4;
565       y[row++]=sum5;
566       v1      =v5;       /* Since the next block to be processed starts there */
567       idx    +=4*sz;
568       break;
569     default:
570       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571     }
572   }
573   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
574   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
575   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
576   PetscFunctionReturn(0);
577 }
578 /* ----------------------------------------------------------- */
579 /* Almost same code as the MatMult_SeqAIJ_Inode() */
580 #undef __FUNCT__
581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583 {
584   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
585   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586   const MatScalar   *v1,*v2,*v3,*v4,*v5;
587   const PetscScalar *x;
588   PetscScalar       *y,*z,*zt;
589   PetscErrorCode    ierr;
590   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
591   const PetscInt    *idx,*ns,*ii;
592 
593   PetscFunctionBegin;
594   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595   node_max = a->inode.node_count;
596   ns       = a->inode.size;     /* Node Size array */
597 
598   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
599   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
600   if (zz != yy) {
601     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
602   } else {
603     z = y;
604   }
605   zt = z;
606 
607   idx = a->j;
608   v1  = a->a;
609   ii  = a->i;
610 
611   for (i = 0,row = 0; i< node_max; ++i) {
612     nsz = ns[i];
613     n   = ii[1] - ii[0];
614     ii += nsz;
615     sz  = n;                    /* No of non zeros in this row */
616                                 /* Switch on the size of Node */
617     switch (nsz) {               /* Each loop in 'case' is unrolled */
618     case 1:
619       sum1 = *zt++;
620 
621       for (n = 0; n< sz-1; n+=2) {
622         i1    = idx[0];         /* The instructions are ordered to */
623         i2    = idx[1];         /* make the compiler's job easy */
624         idx  += 2;
625         tmp0  = x[i1];
626         tmp1  = x[i2];
627         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
628       }
629 
630       if (n   == sz-1) {          /* Take care of the last nonzero  */
631         tmp0  = x[*idx++];
632         sum1 += *v1++ * tmp0;
633       }
634       y[row++]=sum1;
635       break;
636     case 2:
637       sum1 = *zt++;
638       sum2 = *zt++;
639       v2   = v1 + n;
640 
641       for (n = 0; n< sz-1; n+=2) {
642         i1    = idx[0];
643         i2    = idx[1];
644         idx  += 2;
645         tmp0  = x[i1];
646         tmp1  = x[i2];
647         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
648         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
649       }
650       if (n   == sz-1) {
651         tmp0  = x[*idx++];
652         sum1 += *v1++ * tmp0;
653         sum2 += *v2++ * tmp0;
654       }
655       y[row++]=sum1;
656       y[row++]=sum2;
657       v1      =v2;              /* Since the next block to be processed starts there*/
658       idx    +=sz;
659       break;
660     case 3:
661       sum1 = *zt++;
662       sum2 = *zt++;
663       sum3 = *zt++;
664       v2   = v1 + n;
665       v3   = v2 + n;
666 
667       for (n = 0; n< sz-1; n+=2) {
668         i1    = idx[0];
669         i2    = idx[1];
670         idx  += 2;
671         tmp0  = x[i1];
672         tmp1  = x[i2];
673         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
674         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
675         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
676       }
677       if (n == sz-1) {
678         tmp0  = x[*idx++];
679         sum1 += *v1++ * tmp0;
680         sum2 += *v2++ * tmp0;
681         sum3 += *v3++ * tmp0;
682       }
683       y[row++]=sum1;
684       y[row++]=sum2;
685       y[row++]=sum3;
686       v1      =v3;              /* Since the next block to be processed starts there*/
687       idx    +=2*sz;
688       break;
689     case 4:
690       sum1 = *zt++;
691       sum2 = *zt++;
692       sum3 = *zt++;
693       sum4 = *zt++;
694       v2   = v1 + n;
695       v3   = v2 + n;
696       v4   = v3 + n;
697 
698       for (n = 0; n< sz-1; n+=2) {
699         i1    = idx[0];
700         i2    = idx[1];
701         idx  += 2;
702         tmp0  = x[i1];
703         tmp1  = x[i2];
704         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
705         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
706         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
707         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
708       }
709       if (n == sz-1) {
710         tmp0  = x[*idx++];
711         sum1 += *v1++ * tmp0;
712         sum2 += *v2++ * tmp0;
713         sum3 += *v3++ * tmp0;
714         sum4 += *v4++ * tmp0;
715       }
716       y[row++]=sum1;
717       y[row++]=sum2;
718       y[row++]=sum3;
719       y[row++]=sum4;
720       v1      =v4;              /* Since the next block to be processed starts there*/
721       idx    +=3*sz;
722       break;
723     case 5:
724       sum1 = *zt++;
725       sum2 = *zt++;
726       sum3 = *zt++;
727       sum4 = *zt++;
728       sum5 = *zt++;
729       v2   = v1 + n;
730       v3   = v2 + n;
731       v4   = v3 + n;
732       v5   = v4 + n;
733 
734       for (n = 0; n<sz-1; n+=2) {
735         i1    = idx[0];
736         i2    = idx[1];
737         idx  += 2;
738         tmp0  = x[i1];
739         tmp1  = x[i2];
740         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
741         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
742         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
743         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
744         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
745       }
746       if (n   == sz-1) {
747         tmp0  = x[*idx++];
748         sum1 += *v1++ * tmp0;
749         sum2 += *v2++ * tmp0;
750         sum3 += *v3++ * tmp0;
751         sum4 += *v4++ * tmp0;
752         sum5 += *v5++ * tmp0;
753       }
754       y[row++]=sum1;
755       y[row++]=sum2;
756       y[row++]=sum3;
757       y[row++]=sum4;
758       y[row++]=sum5;
759       v1      =v5;       /* Since the next block to be processed starts there */
760       idx    +=4*sz;
761       break;
762     default:
763       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
764     }
765   }
766   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
767   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
768   if (zz != yy) {
769     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
770   }
771   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
772   PetscFunctionReturn(0);
773 }
774 
775 /* ----------------------------------------------------------- */
776 #undef __FUNCT__
777 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
778 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
779 {
780   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
781   IS                iscol = a->col,isrow = a->row;
782   PetscErrorCode    ierr;
783   const PetscInt    *r,*c,*rout,*cout;
784   PetscInt          i,j,n = A->rmap->n,nz;
785   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
786   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
787   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
788   PetscScalar       sum1,sum2,sum3,sum4,sum5;
789   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790   const PetscScalar *b;
791 
792   PetscFunctionBegin;
793   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
794   node_max = a->inode.node_count;
795   ns       = a->inode.size;     /* Node Size array */
796 
797   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   tmp  = a->solve_work;
800 
801   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
802   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
803 
804   /* forward solve the lower triangular */
805   tmps = tmp;
806   aa   = a_a;
807   aj   = a_j;
808   ad   = a->diag;
809 
810   for (i = 0,row = 0; i< node_max; ++i) {
811     nsz = ns[i];
812     aii = ai[row];
813     v1  = aa + aii;
814     vi  = aj + aii;
815     nz  = ad[row]- aii;
816     if (i < node_max-1) {
817       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
818       * but our indexing to determine it's size could. */
819       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
820       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
821       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
822       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
823     }
824 
825     switch (nsz) {               /* Each loop in 'case' is unrolled */
826     case 1:
827       sum1 = b[*r++];
828       for (j=0; j<nz-1; j+=2) {
829         i0    = vi[0];
830         i1    = vi[1];
831         vi   +=2;
832         tmp0  = tmps[i0];
833         tmp1  = tmps[i1];
834         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
835       }
836       if (j == nz-1) {
837         tmp0  = tmps[*vi++];
838         sum1 -= *v1++ *tmp0;
839       }
840       tmp[row++]=sum1;
841       break;
842     case 2:
843       sum1 = b[*r++];
844       sum2 = b[*r++];
845       v2   = aa + ai[row+1];
846 
847       for (j=0; j<nz-1; j+=2) {
848         i0    = vi[0];
849         i1    = vi[1];
850         vi   +=2;
851         tmp0  = tmps[i0];
852         tmp1  = tmps[i1];
853         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
854         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
855       }
856       if (j == nz-1) {
857         tmp0  = tmps[*vi++];
858         sum1 -= *v1++ *tmp0;
859         sum2 -= *v2++ *tmp0;
860       }
861       sum2     -= *v2++ *sum1;
862       tmp[row++]=sum1;
863       tmp[row++]=sum2;
864       break;
865     case 3:
866       sum1 = b[*r++];
867       sum2 = b[*r++];
868       sum3 = b[*r++];
869       v2   = aa + ai[row+1];
870       v3   = aa + ai[row+2];
871 
872       for (j=0; j<nz-1; j+=2) {
873         i0    = vi[0];
874         i1    = vi[1];
875         vi   +=2;
876         tmp0  = tmps[i0];
877         tmp1  = tmps[i1];
878         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
879         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
880         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
881       }
882       if (j == nz-1) {
883         tmp0  = tmps[*vi++];
884         sum1 -= *v1++ *tmp0;
885         sum2 -= *v2++ *tmp0;
886         sum3 -= *v3++ *tmp0;
887       }
888       sum2 -= *v2++ * sum1;
889       sum3 -= *v3++ * sum1;
890       sum3 -= *v3++ * sum2;
891 
892       tmp[row++]=sum1;
893       tmp[row++]=sum2;
894       tmp[row++]=sum3;
895       break;
896 
897     case 4:
898       sum1 = b[*r++];
899       sum2 = b[*r++];
900       sum3 = b[*r++];
901       sum4 = b[*r++];
902       v2   = aa + ai[row+1];
903       v3   = aa + ai[row+2];
904       v4   = aa + ai[row+3];
905 
906       for (j=0; j<nz-1; j+=2) {
907         i0    = vi[0];
908         i1    = vi[1];
909         vi   +=2;
910         tmp0  = tmps[i0];
911         tmp1  = tmps[i1];
912         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
913         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
914         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
915         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
916       }
917       if (j == nz-1) {
918         tmp0  = tmps[*vi++];
919         sum1 -= *v1++ *tmp0;
920         sum2 -= *v2++ *tmp0;
921         sum3 -= *v3++ *tmp0;
922         sum4 -= *v4++ *tmp0;
923       }
924       sum2 -= *v2++ * sum1;
925       sum3 -= *v3++ * sum1;
926       sum4 -= *v4++ * sum1;
927       sum3 -= *v3++ * sum2;
928       sum4 -= *v4++ * sum2;
929       sum4 -= *v4++ * sum3;
930 
931       tmp[row++]=sum1;
932       tmp[row++]=sum2;
933       tmp[row++]=sum3;
934       tmp[row++]=sum4;
935       break;
936     case 5:
937       sum1 = b[*r++];
938       sum2 = b[*r++];
939       sum3 = b[*r++];
940       sum4 = b[*r++];
941       sum5 = b[*r++];
942       v2   = aa + ai[row+1];
943       v3   = aa + ai[row+2];
944       v4   = aa + ai[row+3];
945       v5   = aa + ai[row+4];
946 
947       for (j=0; j<nz-1; j+=2) {
948         i0    = vi[0];
949         i1    = vi[1];
950         vi   +=2;
951         tmp0  = tmps[i0];
952         tmp1  = tmps[i1];
953         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
954         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
955         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
956         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
957         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
958       }
959       if (j == nz-1) {
960         tmp0  = tmps[*vi++];
961         sum1 -= *v1++ *tmp0;
962         sum2 -= *v2++ *tmp0;
963         sum3 -= *v3++ *tmp0;
964         sum4 -= *v4++ *tmp0;
965         sum5 -= *v5++ *tmp0;
966       }
967 
968       sum2 -= *v2++ * sum1;
969       sum3 -= *v3++ * sum1;
970       sum4 -= *v4++ * sum1;
971       sum5 -= *v5++ * sum1;
972       sum3 -= *v3++ * sum2;
973       sum4 -= *v4++ * sum2;
974       sum5 -= *v5++ * sum2;
975       sum4 -= *v4++ * sum3;
976       sum5 -= *v5++ * sum3;
977       sum5 -= *v5++ * sum4;
978 
979       tmp[row++]=sum1;
980       tmp[row++]=sum2;
981       tmp[row++]=sum3;
982       tmp[row++]=sum4;
983       tmp[row++]=sum5;
984       break;
985     default:
986       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
987     }
988   }
989   /* backward solve the upper triangular */
990   for (i=node_max -1,row = n-1; i>=0; i--) {
991     nsz = ns[i];
992     aii = ai[row+1] -1;
993     v1  = aa + aii;
994     vi  = aj + aii;
995     nz  = aii- ad[row];
996     switch (nsz) {               /* Each loop in 'case' is unrolled */
997     case 1:
998       sum1 = tmp[row];
999 
1000       for (j=nz; j>1; j-=2) {
1001         vi   -=2;
1002         i0    = vi[2];
1003         i1    = vi[1];
1004         tmp0  = tmps[i0];
1005         tmp1  = tmps[i1];
1006         v1   -= 2;
1007         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008       }
1009       if (j==1) {
1010         tmp0  = tmps[*vi--];
1011         sum1 -= *v1-- * tmp0;
1012       }
1013       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1014       break;
1015     case 2:
1016       sum1 = tmp[row];
1017       sum2 = tmp[row -1];
1018       v2   = aa + ai[row]-1;
1019       for (j=nz; j>1; j-=2) {
1020         vi   -=2;
1021         i0    = vi[2];
1022         i1    = vi[1];
1023         tmp0  = tmps[i0];
1024         tmp1  = tmps[i1];
1025         v1   -= 2;
1026         v2   -= 2;
1027         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1028         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1029       }
1030       if (j==1) {
1031         tmp0  = tmps[*vi--];
1032         sum1 -= *v1-- * tmp0;
1033         sum2 -= *v2-- * tmp0;
1034       }
1035 
1036       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1037       sum2   -= *v2-- * tmp0;
1038       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1039       break;
1040     case 3:
1041       sum1 = tmp[row];
1042       sum2 = tmp[row -1];
1043       sum3 = tmp[row -2];
1044       v2   = aa + ai[row]-1;
1045       v3   = aa + ai[row -1]-1;
1046       for (j=nz; j>1; j-=2) {
1047         vi   -=2;
1048         i0    = vi[2];
1049         i1    = vi[1];
1050         tmp0  = tmps[i0];
1051         tmp1  = tmps[i1];
1052         v1   -= 2;
1053         v2   -= 2;
1054         v3   -= 2;
1055         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1056         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1057         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1058       }
1059       if (j==1) {
1060         tmp0  = tmps[*vi--];
1061         sum1 -= *v1-- * tmp0;
1062         sum2 -= *v2-- * tmp0;
1063         sum3 -= *v3-- * tmp0;
1064       }
1065       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1066       sum2   -= *v2-- * tmp0;
1067       sum3   -= *v3-- * tmp0;
1068       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1069       sum3   -= *v3-- * tmp0;
1070       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1071 
1072       break;
1073     case 4:
1074       sum1 = tmp[row];
1075       sum2 = tmp[row -1];
1076       sum3 = tmp[row -2];
1077       sum4 = tmp[row -3];
1078       v2   = aa + ai[row]-1;
1079       v3   = aa + ai[row -1]-1;
1080       v4   = aa + ai[row -2]-1;
1081 
1082       for (j=nz; j>1; j-=2) {
1083         vi   -=2;
1084         i0    = vi[2];
1085         i1    = vi[1];
1086         tmp0  = tmps[i0];
1087         tmp1  = tmps[i1];
1088         v1   -= 2;
1089         v2   -= 2;
1090         v3   -= 2;
1091         v4   -= 2;
1092         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1093         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1094         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1095         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1096       }
1097       if (j==1) {
1098         tmp0  = tmps[*vi--];
1099         sum1 -= *v1-- * tmp0;
1100         sum2 -= *v2-- * tmp0;
1101         sum3 -= *v3-- * tmp0;
1102         sum4 -= *v4-- * tmp0;
1103       }
1104 
1105       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1106       sum2   -= *v2-- * tmp0;
1107       sum3   -= *v3-- * tmp0;
1108       sum4   -= *v4-- * tmp0;
1109       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1110       sum3   -= *v3-- * tmp0;
1111       sum4   -= *v4-- * tmp0;
1112       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1113       sum4   -= *v4-- * tmp0;
1114       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1115       break;
1116     case 5:
1117       sum1 = tmp[row];
1118       sum2 = tmp[row -1];
1119       sum3 = tmp[row -2];
1120       sum4 = tmp[row -3];
1121       sum5 = tmp[row -4];
1122       v2   = aa + ai[row]-1;
1123       v3   = aa + ai[row -1]-1;
1124       v4   = aa + ai[row -2]-1;
1125       v5   = aa + ai[row -3]-1;
1126       for (j=nz; j>1; j-=2) {
1127         vi   -= 2;
1128         i0    = vi[2];
1129         i1    = vi[1];
1130         tmp0  = tmps[i0];
1131         tmp1  = tmps[i1];
1132         v1   -= 2;
1133         v2   -= 2;
1134         v3   -= 2;
1135         v4   -= 2;
1136         v5   -= 2;
1137         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142       }
1143       if (j==1) {
1144         tmp0  = tmps[*vi--];
1145         sum1 -= *v1-- * tmp0;
1146         sum2 -= *v2-- * tmp0;
1147         sum3 -= *v3-- * tmp0;
1148         sum4 -= *v4-- * tmp0;
1149         sum5 -= *v5-- * tmp0;
1150       }
1151 
1152       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1153       sum2   -= *v2-- * tmp0;
1154       sum3   -= *v3-- * tmp0;
1155       sum4   -= *v4-- * tmp0;
1156       sum5   -= *v5-- * tmp0;
1157       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1158       sum3   -= *v3-- * tmp0;
1159       sum4   -= *v4-- * tmp0;
1160       sum5   -= *v5-- * tmp0;
1161       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162       sum4   -= *v4-- * tmp0;
1163       sum5   -= *v5-- * tmp0;
1164       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165       sum5   -= *v5-- * tmp0;
1166       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1167       break;
1168     default:
1169       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1170     }
1171   }
1172   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1173   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1174   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1176   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 #undef __FUNCT__
1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1183 {
1184   Mat             C     =B;
1185   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1186   IS              isrow = b->row,isicol = b->icol;
1187   PetscErrorCode  ierr;
1188   const PetscInt  *r,*ic,*ics;
1189   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1190   PetscInt        i,j,k,nz,nzL,row,*pj;
1191   const PetscInt  *ajtmp,*bjtmp;
1192   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1193   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1194   FactorShiftCtx  sctx;
1195   const PetscInt  *ddiag;
1196   PetscReal       rs;
1197   MatScalar       d;
1198   PetscInt        inod,nodesz,node_max,col;
1199   const PetscInt  *ns;
1200   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1201 
1202   PetscFunctionBegin;
1203   /* MatPivotSetUp(): initialize shift context sctx */
1204   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1205 
1206   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207     ddiag          = a->diag;
1208     sctx.shift_top = info->zeropivot;
1209     for (i=0; i<n; i++) {
1210       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211       d  = (aa)[ddiag[i]];
1212       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213       v  = aa+ai[i];
1214       nz = ai[i+1] - ai[i];
1215       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1216       if (rs>sctx.shift_top) sctx.shift_top = rs;
1217     }
1218     sctx.shift_top *= 1.1;
1219     sctx.nshift_max = 5;
1220     sctx.shift_lo   = 0.;
1221     sctx.shift_hi   = 1.;
1222   }
1223 
1224   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1225   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1226 
1227   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1228   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1229   rtmp2 = rtmp1 + n;
1230   rtmp3 = rtmp2 + n;
1231   rtmp4 = rtmp3 + n;
1232   ics   = ic;
1233 
1234   node_max = a->inode.node_count;
1235   ns       = a->inode.size;
1236   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1237 
1238   /* If max inode size > 4, split it into two inodes.*/
1239   /* also map the inode sizes according to the ordering */
1240   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1241   for (i=0,j=0; i<node_max; ++i,++j) {
1242     if (ns[i] > 4) {
1243       tmp_vec1[j] = 4;
1244       ++j;
1245       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1246     } else {
1247       tmp_vec1[j] = ns[i];
1248     }
1249   }
1250   /* Use the correct node_max */
1251   node_max = j;
1252 
1253   /* Now reorder the inode info based on mat re-ordering info */
1254   /* First create a row -> inode_size_array_index map */
1255   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1256   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1257   for (i=0,row=0; i<node_max; i++) {
1258     nodesz = tmp_vec1[i];
1259     for (j=0; j<nodesz; j++,row++) {
1260       nsmap[row] = i;
1261     }
1262   }
1263   /* Using nsmap, create a reordered ns structure */
1264   for (i=0,j=0; i< node_max; i++) {
1265     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1266     tmp_vec2[i] = nodesz;
1267     j          += nodesz;
1268   }
1269   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1270   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1271 
1272   /* Now use the correct ns */
1273   ns = tmp_vec2;
1274 
1275   do {
1276     sctx.newshift = PETSC_FALSE;
1277     /* Now loop over each block-row, and do the factorization */
1278     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1279       nodesz = ns[inod];
1280 
1281       switch (nodesz) {
1282       case 1:
1283         /*----------*/
1284         /* zero rtmp1 */
1285         /* L part */
1286         nz    = bi[i+1] - bi[i];
1287         bjtmp = bj + bi[i];
1288         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1289 
1290         /* U part */
1291         nz    = bdiag[i]-bdiag[i+1];
1292         bjtmp = bj + bdiag[i+1]+1;
1293         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1294 
1295         /* load in initial (unfactored row) */
1296         nz    = ai[r[i]+1] - ai[r[i]];
1297         ajtmp = aj + ai[r[i]];
1298         v     = aa + ai[r[i]];
1299         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1300 
1301         /* ZeropivotApply() */
1302         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1303 
1304         /* elimination */
1305         bjtmp = bj + bi[i];
1306         row   = *bjtmp++;
1307         nzL   = bi[i+1] - bi[i];
1308         for (k=0; k < nzL; k++) {
1309           pc = rtmp1 + row;
1310           if (*pc != 0.0) {
1311             pv   = b->a + bdiag[row];
1312             mul1 = *pc * (*pv);
1313             *pc  = mul1;
1314             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1315             pv   = b->a + bdiag[row+1]+1;
1316             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1317             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1318             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1319           }
1320           row = *bjtmp++;
1321         }
1322 
1323         /* finished row so stick it into b->a */
1324         rs = 0.0;
1325         /* L part */
1326         pv = b->a + bi[i];
1327         pj = b->j + bi[i];
1328         nz = bi[i+1] - bi[i];
1329         for (j=0; j<nz; j++) {
1330           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1331         }
1332 
1333         /* U part */
1334         pv = b->a + bdiag[i+1]+1;
1335         pj = b->j + bdiag[i+1]+1;
1336         nz = bdiag[i] - bdiag[i+1]-1;
1337         for (j=0; j<nz; j++) {
1338           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1339         }
1340 
1341         /* Check zero pivot */
1342         sctx.rs = rs;
1343         sctx.pv = rtmp1[i];
1344         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1345         if (sctx.newshift) break;
1346 
1347         /* Mark diagonal and invert diagonal for simplier triangular solves */
1348         pv  = b->a + bdiag[i];
1349         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1350         break;
1351 
1352       case 2:
1353         /*----------*/
1354         /* zero rtmp1 and rtmp2 */
1355         /* L part */
1356         nz    = bi[i+1] - bi[i];
1357         bjtmp = bj + bi[i];
1358         for  (j=0; j<nz; j++) {
1359           col        = bjtmp[j];
1360           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1361         }
1362 
1363         /* U part */
1364         nz    = bdiag[i]-bdiag[i+1];
1365         bjtmp = bj + bdiag[i+1]+1;
1366         for  (j=0; j<nz; j++) {
1367           col        = bjtmp[j];
1368           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1369         }
1370 
1371         /* load in initial (unfactored row) */
1372         nz    = ai[r[i]+1] - ai[r[i]];
1373         ajtmp = aj + ai[r[i]];
1374         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1375         for (j=0; j<nz; j++) {
1376           col        = ics[ajtmp[j]];
1377           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1378         }
1379         /* ZeropivotApply(): shift the diagonal of the matrix  */
1380         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1381 
1382         /* elimination */
1383         bjtmp = bj + bi[i];
1384         row   = *bjtmp++; /* pivot row */
1385         nzL   = bi[i+1] - bi[i];
1386         for (k=0; k < nzL; k++) {
1387           pc1 = rtmp1 + row;
1388           pc2 = rtmp2 + row;
1389           if (*pc1 != 0.0 || *pc2 != 0.0) {
1390             pv   = b->a + bdiag[row];
1391             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1392             *pc1 = mul1;       *pc2 = mul2;
1393 
1394             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1395             pv = b->a + bdiag[row+1]+1;
1396             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1397             for (j=0; j<nz; j++) {
1398               col         = pj[j];
1399               rtmp1[col] -= mul1 * pv[j];
1400               rtmp2[col] -= mul2 * pv[j];
1401             }
1402             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1403           }
1404           row = *bjtmp++;
1405         }
1406 
1407         /* finished row i; check zero pivot, then stick row i into b->a */
1408         rs = 0.0;
1409         /* L part */
1410         pc1 = b->a + bi[i];
1411         pj  = b->j + bi[i];
1412         nz  = bi[i+1] - bi[i];
1413         for (j=0; j<nz; j++) {
1414           col    = pj[j];
1415           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1416         }
1417         /* U part */
1418         pc1 = b->a + bdiag[i+1]+1;
1419         pj  = b->j + bdiag[i+1]+1;
1420         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1421         for (j=0; j<nz; j++) {
1422           col    = pj[j];
1423           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1424         }
1425 
1426         sctx.rs = rs;
1427         sctx.pv = rtmp1[i];
1428         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1429         if (sctx.newshift) break;
1430         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1431         *pc1 = 1.0/sctx.pv;
1432 
1433         /* Now take care of diagonal 2x2 block. */
1434         pc2 = rtmp2 + i;
1435         if (*pc2 != 0.0) {
1436           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1437           *pc2 = mul1;          /* insert L entry */
1438           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1439           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1440           for (j=0; j<nz; j++) {
1441             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1442           }
1443           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1444         }
1445 
1446         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1447         rs = 0.0;
1448         /* L part */
1449         pc2 = b->a + bi[i+1];
1450         pj  = b->j + bi[i+1];
1451         nz  = bi[i+2] - bi[i+1];
1452         for (j=0; j<nz; j++) {
1453           col    = pj[j];
1454           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1455         }
1456         /* U part */
1457         pc2 = b->a + bdiag[i+2]+1;
1458         pj  = b->j + bdiag[i+2]+1;
1459         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1460         for (j=0; j<nz; j++) {
1461           col    = pj[j];
1462           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1463         }
1464 
1465         sctx.rs = rs;
1466         sctx.pv = rtmp2[i+1];
1467         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1468         if (sctx.newshift) break;
1469         pc2  = b->a + bdiag[i+1];
1470         *pc2 = 1.0/sctx.pv;
1471         break;
1472 
1473       case 3:
1474         /*----------*/
1475         /* zero rtmp */
1476         /* L part */
1477         nz    = bi[i+1] - bi[i];
1478         bjtmp = bj + bi[i];
1479         for  (j=0; j<nz; j++) {
1480           col        = bjtmp[j];
1481           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1482         }
1483 
1484         /* U part */
1485         nz    = bdiag[i]-bdiag[i+1];
1486         bjtmp = bj + bdiag[i+1]+1;
1487         for  (j=0; j<nz; j++) {
1488           col        = bjtmp[j];
1489           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1490         }
1491 
1492         /* load in initial (unfactored row) */
1493         nz    = ai[r[i]+1] - ai[r[i]];
1494         ajtmp = aj + ai[r[i]];
1495         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1496         for (j=0; j<nz; j++) {
1497           col        = ics[ajtmp[j]];
1498           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1499         }
1500         /* ZeropivotApply(): shift the diagonal of the matrix  */
1501         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1502 
1503         /* elimination */
1504         bjtmp = bj + bi[i];
1505         row   = *bjtmp++; /* pivot row */
1506         nzL   = bi[i+1] - bi[i];
1507         for (k=0; k < nzL; k++) {
1508           pc1 = rtmp1 + row;
1509           pc2 = rtmp2 + row;
1510           pc3 = rtmp3 + row;
1511           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1512             pv   = b->a + bdiag[row];
1513             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1514             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1515 
1516             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1517             pv = b->a + bdiag[row+1]+1;
1518             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1519             for (j=0; j<nz; j++) {
1520               col         = pj[j];
1521               rtmp1[col] -= mul1 * pv[j];
1522               rtmp2[col] -= mul2 * pv[j];
1523               rtmp3[col] -= mul3 * pv[j];
1524             }
1525             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1526           }
1527           row = *bjtmp++;
1528         }
1529 
1530         /* finished row i; check zero pivot, then stick row i into b->a */
1531         rs = 0.0;
1532         /* L part */
1533         pc1 = b->a + bi[i];
1534         pj  = b->j + bi[i];
1535         nz  = bi[i+1] - bi[i];
1536         for (j=0; j<nz; j++) {
1537           col    = pj[j];
1538           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1539         }
1540         /* U part */
1541         pc1 = b->a + bdiag[i+1]+1;
1542         pj  = b->j + bdiag[i+1]+1;
1543         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1544         for (j=0; j<nz; j++) {
1545           col    = pj[j];
1546           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1547         }
1548 
1549         sctx.rs = rs;
1550         sctx.pv = rtmp1[i];
1551         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1552         if (sctx.newshift) break;
1553         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1554         *pc1 = 1.0/sctx.pv;
1555 
1556         /* Now take care of 1st column of diagonal 3x3 block. */
1557         pc2 = rtmp2 + i;
1558         pc3 = rtmp3 + i;
1559         if (*pc2 != 0.0 || *pc3 != 0.0) {
1560           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1561           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1562           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1563           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1564           for (j=0; j<nz; j++) {
1565             col         = pj[j];
1566             rtmp2[col] -= mul2 * rtmp1[col];
1567             rtmp3[col] -= mul3 * rtmp1[col];
1568           }
1569           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1570         }
1571 
1572         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1573         rs = 0.0;
1574         /* L part */
1575         pc2 = b->a + bi[i+1];
1576         pj  = b->j + bi[i+1];
1577         nz  = bi[i+2] - bi[i+1];
1578         for (j=0; j<nz; j++) {
1579           col    = pj[j];
1580           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1581         }
1582         /* U part */
1583         pc2 = b->a + bdiag[i+2]+1;
1584         pj  = b->j + bdiag[i+2]+1;
1585         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1586         for (j=0; j<nz; j++) {
1587           col    = pj[j];
1588           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1589         }
1590 
1591         sctx.rs = rs;
1592         sctx.pv = rtmp2[i+1];
1593         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1594         if (sctx.newshift) break;
1595         pc2  = b->a + bdiag[i+1];
1596         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1597 
1598         /* Now take care of 2nd column of diagonal 3x3 block. */
1599         pc3 = rtmp3 + i+1;
1600         if (*pc3 != 0.0) {
1601           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1602           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1603           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1604           for (j=0; j<nz; j++) {
1605             col         = pj[j];
1606             rtmp3[col] -= mul3 * rtmp2[col];
1607           }
1608           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1609         }
1610 
1611         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1612         rs = 0.0;
1613         /* L part */
1614         pc3 = b->a + bi[i+2];
1615         pj  = b->j + bi[i+2];
1616         nz  = bi[i+3] - bi[i+2];
1617         for (j=0; j<nz; j++) {
1618           col    = pj[j];
1619           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1620         }
1621         /* U part */
1622         pc3 = b->a + bdiag[i+3]+1;
1623         pj  = b->j + bdiag[i+3]+1;
1624         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1625         for (j=0; j<nz; j++) {
1626           col    = pj[j];
1627           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1628         }
1629 
1630         sctx.rs = rs;
1631         sctx.pv = rtmp3[i+2];
1632         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1633         if (sctx.newshift) break;
1634         pc3  = b->a + bdiag[i+2];
1635         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1636         break;
1637       case 4:
1638         /*----------*/
1639         /* zero rtmp */
1640         /* L part */
1641         nz    = bi[i+1] - bi[i];
1642         bjtmp = bj + bi[i];
1643         for  (j=0; j<nz; j++) {
1644           col        = bjtmp[j];
1645           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1646         }
1647 
1648         /* U part */
1649         nz    = bdiag[i]-bdiag[i+1];
1650         bjtmp = bj + bdiag[i+1]+1;
1651         for  (j=0; j<nz; j++) {
1652           col        = bjtmp[j];
1653           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1654         }
1655 
1656         /* load in initial (unfactored row) */
1657         nz    = ai[r[i]+1] - ai[r[i]];
1658         ajtmp = aj + ai[r[i]];
1659         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1660         for (j=0; j<nz; j++) {
1661           col        = ics[ajtmp[j]];
1662           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1663         }
1664         /* ZeropivotApply(): shift the diagonal of the matrix  */
1665         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1666 
1667         /* elimination */
1668         bjtmp = bj + bi[i];
1669         row   = *bjtmp++; /* pivot row */
1670         nzL   = bi[i+1] - bi[i];
1671         for (k=0; k < nzL; k++) {
1672           pc1 = rtmp1 + row;
1673           pc2 = rtmp2 + row;
1674           pc3 = rtmp3 + row;
1675           pc4 = rtmp4 + row;
1676           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1677             pv   = b->a + bdiag[row];
1678             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1679             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1680 
1681             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1682             pv = b->a + bdiag[row+1]+1;
1683             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1684             for (j=0; j<nz; j++) {
1685               col         = pj[j];
1686               rtmp1[col] -= mul1 * pv[j];
1687               rtmp2[col] -= mul2 * pv[j];
1688               rtmp3[col] -= mul3 * pv[j];
1689               rtmp4[col] -= mul4 * pv[j];
1690             }
1691             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1692           }
1693           row = *bjtmp++;
1694         }
1695 
1696         /* finished row i; check zero pivot, then stick row i into b->a */
1697         rs = 0.0;
1698         /* L part */
1699         pc1 = b->a + bi[i];
1700         pj  = b->j + bi[i];
1701         nz  = bi[i+1] - bi[i];
1702         for (j=0; j<nz; j++) {
1703           col    = pj[j];
1704           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1705         }
1706         /* U part */
1707         pc1 = b->a + bdiag[i+1]+1;
1708         pj  = b->j + bdiag[i+1]+1;
1709         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1710         for (j=0; j<nz; j++) {
1711           col    = pj[j];
1712           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1713         }
1714 
1715         sctx.rs = rs;
1716         sctx.pv = rtmp1[i];
1717         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1718         if (sctx.newshift) break;
1719         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1720         *pc1 = 1.0/sctx.pv;
1721 
1722         /* Now take care of 1st column of diagonal 4x4 block. */
1723         pc2 = rtmp2 + i;
1724         pc3 = rtmp3 + i;
1725         pc4 = rtmp4 + i;
1726         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1727           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1728           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1729           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1730           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1731           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1732           for (j=0; j<nz; j++) {
1733             col         = pj[j];
1734             rtmp2[col] -= mul2 * rtmp1[col];
1735             rtmp3[col] -= mul3 * rtmp1[col];
1736             rtmp4[col] -= mul4 * rtmp1[col];
1737           }
1738           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1739         }
1740 
1741         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1742         rs = 0.0;
1743         /* L part */
1744         pc2 = b->a + bi[i+1];
1745         pj  = b->j + bi[i+1];
1746         nz  = bi[i+2] - bi[i+1];
1747         for (j=0; j<nz; j++) {
1748           col    = pj[j];
1749           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1750         }
1751         /* U part */
1752         pc2 = b->a + bdiag[i+2]+1;
1753         pj  = b->j + bdiag[i+2]+1;
1754         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1755         for (j=0; j<nz; j++) {
1756           col    = pj[j];
1757           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1758         }
1759 
1760         sctx.rs = rs;
1761         sctx.pv = rtmp2[i+1];
1762         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1763         if (sctx.newshift) break;
1764         pc2  = b->a + bdiag[i+1];
1765         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1766 
1767         /* Now take care of 2nd column of diagonal 4x4 block. */
1768         pc3 = rtmp3 + i+1;
1769         pc4 = rtmp4 + i+1;
1770         if (*pc3 != 0.0 || *pc4 != 0.0) {
1771           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1772           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1773           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1774           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1775           for (j=0; j<nz; j++) {
1776             col         = pj[j];
1777             rtmp3[col] -= mul3 * rtmp2[col];
1778             rtmp4[col] -= mul4 * rtmp2[col];
1779           }
1780           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1781         }
1782 
1783         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1784         rs = 0.0;
1785         /* L part */
1786         pc3 = b->a + bi[i+2];
1787         pj  = b->j + bi[i+2];
1788         nz  = bi[i+3] - bi[i+2];
1789         for (j=0; j<nz; j++) {
1790           col    = pj[j];
1791           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1792         }
1793         /* U part */
1794         pc3 = b->a + bdiag[i+3]+1;
1795         pj  = b->j + bdiag[i+3]+1;
1796         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1797         for (j=0; j<nz; j++) {
1798           col    = pj[j];
1799           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1800         }
1801 
1802         sctx.rs = rs;
1803         sctx.pv = rtmp3[i+2];
1804         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1805         if (sctx.newshift) break;
1806         pc3  = b->a + bdiag[i+2];
1807         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1808 
1809         /* Now take care of 3rd column of diagonal 4x4 block. */
1810         pc4 = rtmp4 + i+2;
1811         if (*pc4 != 0.0) {
1812           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1813           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1814           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1815           for (j=0; j<nz; j++) {
1816             col         = pj[j];
1817             rtmp4[col] -= mul4 * rtmp3[col];
1818           }
1819           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1820         }
1821 
1822         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1823         rs = 0.0;
1824         /* L part */
1825         pc4 = b->a + bi[i+3];
1826         pj  = b->j + bi[i+3];
1827         nz  = bi[i+4] - bi[i+3];
1828         for (j=0; j<nz; j++) {
1829           col    = pj[j];
1830           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1831         }
1832         /* U part */
1833         pc4 = b->a + bdiag[i+4]+1;
1834         pj  = b->j + bdiag[i+4]+1;
1835         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1836         for (j=0; j<nz; j++) {
1837           col    = pj[j];
1838           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1839         }
1840 
1841         sctx.rs = rs;
1842         sctx.pv = rtmp4[i+3];
1843         ierr    = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr);
1844         if (sctx.newshift) break;
1845         pc4  = b->a + bdiag[i+3];
1846         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1847         break;
1848 
1849       default:
1850         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1851       }
1852       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1853       i += nodesz;                 /* Update the row */
1854     }
1855 
1856     /* MatPivotRefine() */
1857     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1858       /*
1859        * if no shift in this attempt & shifting & started shifting & can refine,
1860        * then try lower shift
1861        */
1862       sctx.shift_hi       = sctx.shift_fraction;
1863       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1864       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1865       sctx.newshift       = PETSC_TRUE;
1866       sctx.nshift++;
1867     }
1868   } while (sctx.newshift);
1869 
1870   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1871   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1872   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1873   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1874 
1875   if (b->inode.size) {
1876     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1877   } else {
1878     C->ops->solve           = MatSolve_SeqAIJ;
1879   }
1880   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1881   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1882   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1883   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1884   C->assembled              = PETSC_TRUE;
1885   C->preallocated           = PETSC_TRUE;
1886 
1887   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1888 
1889   /* MatShiftView(A,info,&sctx) */
1890   if (sctx.nshift) {
1891     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1892       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1893     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1894       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1895     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1896       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1897     }
1898   }
1899   PetscFunctionReturn(0);
1900 }
1901 
1902 #undef __FUNCT__
1903 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1904 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1905 {
1906   Mat             C     = B;
1907   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1908   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1909   PetscErrorCode  ierr;
1910   const PetscInt  *r,*ic,*c,*ics;
1911   PetscInt        n   = A->rmap->n,*bi = b->i;
1912   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1913   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1914   PetscInt        *ai = a->i,*aj = a->j;
1915   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1916   PetscScalar     mul1,mul2,mul3,tmp;
1917   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1918   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1919   PetscReal       rs=0.0;
1920   FactorShiftCtx  sctx;
1921 
1922   PetscFunctionBegin;
1923   sctx.shift_top      = 0;
1924   sctx.nshift_max     = 0;
1925   sctx.shift_lo       = 0;
1926   sctx.shift_hi       = 0;
1927   sctx.shift_fraction = 0;
1928 
1929   /* if both shift schemes are chosen by user, only use info->shiftpd */
1930   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1931     sctx.shift_top = 0;
1932     for (i=0; i<n; i++) {
1933       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1934       rs    = 0.0;
1935       ajtmp = aj + ai[i];
1936       rtmp1 = aa + ai[i];
1937       nz    = ai[i+1] - ai[i];
1938       for (j=0; j<nz; j++) {
1939         if (*ajtmp != i) {
1940           rs += PetscAbsScalar(*rtmp1++);
1941         } else {
1942           rs -= PetscRealPart(*rtmp1++);
1943         }
1944         ajtmp++;
1945       }
1946       if (rs>sctx.shift_top) sctx.shift_top = rs;
1947     }
1948     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1949     sctx.shift_top *= 1.1;
1950     sctx.nshift_max = 5;
1951     sctx.shift_lo   = 0.;
1952     sctx.shift_hi   = 1.;
1953   }
1954   sctx.shift_amount = 0;
1955   sctx.nshift       = 0;
1956 
1957   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1958   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1959   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1960   ierr   = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1961   ierr   = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1962   ics    = ic;
1963   rtmp22 = rtmp11 + n;
1964   rtmp33 = rtmp22 + n;
1965 
1966   node_max = a->inode.node_count;
1967   ns       = a->inode.size;
1968   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1969 
1970   /* If max inode size > 3, split it into two inodes.*/
1971   /* also map the inode sizes according to the ordering */
1972   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1973   for (i=0,j=0; i<node_max; ++i,++j) {
1974     if (ns[i]>3) {
1975       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1976       ++j;
1977       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1978     } else {
1979       tmp_vec1[j] = ns[i];
1980     }
1981   }
1982   /* Use the correct node_max */
1983   node_max = j;
1984 
1985   /* Now reorder the inode info based on mat re-ordering info */
1986   /* First create a row -> inode_size_array_index map */
1987   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1988   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1989   for (i=0,row=0; i<node_max; i++) {
1990     nodesz = tmp_vec1[i];
1991     for (j=0; j<nodesz; j++,row++) {
1992       nsmap[row] = i;
1993     }
1994   }
1995   /* Using nsmap, create a reordered ns structure */
1996   for (i=0,j=0; i< node_max; i++) {
1997     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1998     tmp_vec2[i] = nodesz;
1999     j          += nodesz;
2000   }
2001   ierr = PetscFree(nsmap);CHKERRQ(ierr);
2002   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
2003   /* Now use the correct ns */
2004   ns = tmp_vec2;
2005 
2006   do {
2007     sctx.newshift = PETSC_FALSE;
2008     /* Now loop over each block-row, and do the factorization */
2009     for (i=0,row=0; i<node_max; i++) {
2010       nodesz = ns[i];
2011       nz     = bi[row+1] - bi[row];
2012       bjtmp  = bj + bi[row];
2013 
2014       switch (nodesz) {
2015       case 1:
2016         for  (j=0; j<nz; j++) {
2017           idx         = bjtmp[j];
2018           rtmp11[idx] = 0.0;
2019         }
2020 
2021         /* load in initial (unfactored row) */
2022         idx    = r[row];
2023         nz_tmp = ai[idx+1] - ai[idx];
2024         ajtmp  = aj + ai[idx];
2025         v1     = aa + ai[idx];
2026 
2027         for (j=0; j<nz_tmp; j++) {
2028           idx         = ics[ajtmp[j]];
2029           rtmp11[idx] = v1[j];
2030         }
2031         rtmp11[ics[r[row]]] += sctx.shift_amount;
2032 
2033         prow = *bjtmp++;
2034         while (prow < row) {
2035           pc1 = rtmp11 + prow;
2036           if (*pc1 != 0.0) {
2037             pv     = ba + bd[prow];
2038             pj     = nbj + bd[prow];
2039             mul1   = *pc1 * *pv++;
2040             *pc1   = mul1;
2041             nz_tmp = bi[prow+1] - bd[prow] - 1;
2042             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2043             for (j=0; j<nz_tmp; j++) {
2044               tmp          = pv[j];
2045               idx          = pj[j];
2046               rtmp11[idx] -= mul1 * tmp;
2047             }
2048           }
2049           prow = *bjtmp++;
2050         }
2051         pj  = bj + bi[row];
2052         pc1 = ba + bi[row];
2053 
2054         sctx.pv     = rtmp11[row];
2055         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2056         rs          = 0.0;
2057         for (j=0; j<nz; j++) {
2058           idx    = pj[j];
2059           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2060           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2061         }
2062         sctx.rs = rs;
2063         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2064         if (sctx.newshift) goto endofwhile;
2065         break;
2066 
2067       case 2:
2068         for (j=0; j<nz; j++) {
2069           idx         = bjtmp[j];
2070           rtmp11[idx] = 0.0;
2071           rtmp22[idx] = 0.0;
2072         }
2073 
2074         /* load in initial (unfactored row) */
2075         idx    = r[row];
2076         nz_tmp = ai[idx+1] - ai[idx];
2077         ajtmp  = aj + ai[idx];
2078         v1     = aa + ai[idx];
2079         v2     = aa + ai[idx+1];
2080         for (j=0; j<nz_tmp; j++) {
2081           idx         = ics[ajtmp[j]];
2082           rtmp11[idx] = v1[j];
2083           rtmp22[idx] = v2[j];
2084         }
2085         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2086         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2087 
2088         prow = *bjtmp++;
2089         while (prow < row) {
2090           pc1 = rtmp11 + prow;
2091           pc2 = rtmp22 + prow;
2092           if (*pc1 != 0.0 || *pc2 != 0.0) {
2093             pv   = ba + bd[prow];
2094             pj   = nbj + bd[prow];
2095             mul1 = *pc1 * *pv;
2096             mul2 = *pc2 * *pv;
2097             ++pv;
2098             *pc1 = mul1;
2099             *pc2 = mul2;
2100 
2101             nz_tmp = bi[prow+1] - bd[prow] - 1;
2102             for (j=0; j<nz_tmp; j++) {
2103               tmp          = pv[j];
2104               idx          = pj[j];
2105               rtmp11[idx] -= mul1 * tmp;
2106               rtmp22[idx] -= mul2 * tmp;
2107             }
2108             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2109           }
2110           prow = *bjtmp++;
2111         }
2112 
2113         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2114         pc1 = rtmp11 + prow;
2115         pc2 = rtmp22 + prow;
2116 
2117         sctx.pv = *pc1;
2118         pj      = bj + bi[prow];
2119         rs      = 0.0;
2120         for (j=0; j<nz; j++) {
2121           idx = pj[j];
2122           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2123         }
2124         sctx.rs = rs;
2125         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2126         if (sctx.newshift) goto endofwhile;
2127 
2128         if (*pc2 != 0.0) {
2129           pj     = nbj + bd[prow];
2130           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2131           *pc2   = mul2;
2132           nz_tmp = bi[prow+1] - bd[prow] - 1;
2133           for (j=0; j<nz_tmp; j++) {
2134             idx          = pj[j];
2135             tmp          = rtmp11[idx];
2136             rtmp22[idx] -= mul2 * tmp;
2137           }
2138           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2139         }
2140 
2141         pj  = bj + bi[row];
2142         pc1 = ba + bi[row];
2143         pc2 = ba + bi[row+1];
2144 
2145         sctx.pv       = rtmp22[row+1];
2146         rs            = 0.0;
2147         rtmp11[row]   = 1.0/rtmp11[row];
2148         rtmp22[row+1] = 1.0/rtmp22[row+1];
2149         /* copy row entries from dense representation to sparse */
2150         for (j=0; j<nz; j++) {
2151           idx    = pj[j];
2152           pc1[j] = rtmp11[idx];
2153           pc2[j] = rtmp22[idx];
2154           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2155         }
2156         sctx.rs = rs;
2157         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2158         if (sctx.newshift) goto endofwhile;
2159         break;
2160 
2161       case 3:
2162         for  (j=0; j<nz; j++) {
2163           idx         = bjtmp[j];
2164           rtmp11[idx] = 0.0;
2165           rtmp22[idx] = 0.0;
2166           rtmp33[idx] = 0.0;
2167         }
2168         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2169         idx    = r[row];
2170         nz_tmp = ai[idx+1] - ai[idx];
2171         ajtmp  = aj + ai[idx];
2172         v1     = aa + ai[idx];
2173         v2     = aa + ai[idx+1];
2174         v3     = aa + ai[idx+2];
2175         for (j=0; j<nz_tmp; j++) {
2176           idx         = ics[ajtmp[j]];
2177           rtmp11[idx] = v1[j];
2178           rtmp22[idx] = v2[j];
2179           rtmp33[idx] = v3[j];
2180         }
2181         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2182         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2183         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2184 
2185         /* loop over all pivot row blocks above this row block */
2186         prow = *bjtmp++;
2187         while (prow < row) {
2188           pc1 = rtmp11 + prow;
2189           pc2 = rtmp22 + prow;
2190           pc3 = rtmp33 + prow;
2191           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2192             pv   = ba  + bd[prow];
2193             pj   = nbj + bd[prow];
2194             mul1 = *pc1 * *pv;
2195             mul2 = *pc2 * *pv;
2196             mul3 = *pc3 * *pv;
2197             ++pv;
2198             *pc1 = mul1;
2199             *pc2 = mul2;
2200             *pc3 = mul3;
2201 
2202             nz_tmp = bi[prow+1] - bd[prow] - 1;
2203             /* update this row based on pivot row */
2204             for (j=0; j<nz_tmp; j++) {
2205               tmp          = pv[j];
2206               idx          = pj[j];
2207               rtmp11[idx] -= mul1 * tmp;
2208               rtmp22[idx] -= mul2 * tmp;
2209               rtmp33[idx] -= mul3 * tmp;
2210             }
2211             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2212           }
2213           prow = *bjtmp++;
2214         }
2215 
2216         /* Now take care of diagonal 3x3 block in this set of rows */
2217         /* note: prow = row here */
2218         pc1 = rtmp11 + prow;
2219         pc2 = rtmp22 + prow;
2220         pc3 = rtmp33 + prow;
2221 
2222         sctx.pv = *pc1;
2223         pj      = bj + bi[prow];
2224         rs      = 0.0;
2225         for (j=0; j<nz; j++) {
2226           idx = pj[j];
2227           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2228         }
2229         sctx.rs = rs;
2230         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2231         if (sctx.newshift) goto endofwhile;
2232 
2233         if (*pc2 != 0.0 || *pc3 != 0.0) {
2234           mul2   = (*pc2)/(*pc1);
2235           mul3   = (*pc3)/(*pc1);
2236           *pc2   = mul2;
2237           *pc3   = mul3;
2238           nz_tmp = bi[prow+1] - bd[prow] - 1;
2239           pj     = nbj + bd[prow];
2240           for (j=0; j<nz_tmp; j++) {
2241             idx          = pj[j];
2242             tmp          = rtmp11[idx];
2243             rtmp22[idx] -= mul2 * tmp;
2244             rtmp33[idx] -= mul3 * tmp;
2245           }
2246           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2247         }
2248         ++prow;
2249 
2250         pc2     = rtmp22 + prow;
2251         pc3     = rtmp33 + prow;
2252         sctx.pv = *pc2;
2253         pj      = bj + bi[prow];
2254         rs      = 0.0;
2255         for (j=0; j<nz; j++) {
2256           idx = pj[j];
2257           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2258         }
2259         sctx.rs = rs;
2260         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2261         if (sctx.newshift) goto endofwhile;
2262 
2263         if (*pc3 != 0.0) {
2264           mul3   = (*pc3)/(*pc2);
2265           *pc3   = mul3;
2266           pj     = nbj + bd[prow];
2267           nz_tmp = bi[prow+1] - bd[prow] - 1;
2268           for (j=0; j<nz_tmp; j++) {
2269             idx          = pj[j];
2270             tmp          = rtmp22[idx];
2271             rtmp33[idx] -= mul3 * tmp;
2272           }
2273           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2274         }
2275 
2276         pj  = bj + bi[row];
2277         pc1 = ba + bi[row];
2278         pc2 = ba + bi[row+1];
2279         pc3 = ba + bi[row+2];
2280 
2281         sctx.pv       = rtmp33[row+2];
2282         rs            = 0.0;
2283         rtmp11[row]   = 1.0/rtmp11[row];
2284         rtmp22[row+1] = 1.0/rtmp22[row+1];
2285         rtmp33[row+2] = 1.0/rtmp33[row+2];
2286         /* copy row entries from dense representation to sparse */
2287         for (j=0; j<nz; j++) {
2288           idx    = pj[j];
2289           pc1[j] = rtmp11[idx];
2290           pc2[j] = rtmp22[idx];
2291           pc3[j] = rtmp33[idx];
2292           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2293         }
2294 
2295         sctx.rs = rs;
2296         ierr    = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr);
2297         if (sctx.newshift) goto endofwhile;
2298         break;
2299 
2300       default:
2301         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2302       }
2303       row += nodesz;                 /* Update the row */
2304     }
2305 endofwhile:;
2306   } while (sctx.newshift);
2307   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2308   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2309   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2310   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2311   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2312 
2313   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2314   /* do not set solve add, since MatSolve_Inode + Add is faster */
2315   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2316   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2317   C->assembled              = PETSC_TRUE;
2318   C->preallocated           = PETSC_TRUE;
2319   if (sctx.nshift) {
2320     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2321       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2322     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2323       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2324     }
2325   }
2326   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2327   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2328   PetscFunctionReturn(0);
2329 }
2330 
2331 
2332 /* ----------------------------------------------------------- */
2333 #undef __FUNCT__
2334 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2335 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2336 {
2337   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2338   IS                iscol = a->col,isrow = a->row;
2339   PetscErrorCode    ierr;
2340   const PetscInt    *r,*c,*rout,*cout;
2341   PetscInt          i,j,n = A->rmap->n;
2342   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2343   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2344   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2345   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2346   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2347   const PetscScalar *b;
2348 
2349   PetscFunctionBegin;
2350   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2351   node_max = a->inode.node_count;
2352   ns       = a->inode.size;     /* Node Size array */
2353 
2354   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2355   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2356   tmp  = a->solve_work;
2357 
2358   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2359   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2360 
2361   /* forward solve the lower triangular */
2362   tmps = tmp;
2363   aa   = a_a;
2364   aj   = a_j;
2365   ad   = a->diag;
2366 
2367   for (i = 0,row = 0; i< node_max; ++i) {
2368     nsz = ns[i];
2369     aii = ai[row];
2370     v1  = aa + aii;
2371     vi  = aj + aii;
2372     nz  = ai[row+1]- ai[row];
2373 
2374     if (i < node_max-1) {
2375       /* Prefetch the indices for the next block */
2376       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2377       /* Prefetch the data for the next block */
2378       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2379     }
2380 
2381     switch (nsz) {               /* Each loop in 'case' is unrolled */
2382     case 1:
2383       sum1 = b[r[row]];
2384       for (j=0; j<nz-1; j+=2) {
2385         i0    = vi[j];
2386         i1    = vi[j+1];
2387         tmp0  = tmps[i0];
2388         tmp1  = tmps[i1];
2389         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2390       }
2391       if (j == nz-1) {
2392         tmp0  = tmps[vi[j]];
2393         sum1 -= v1[j]*tmp0;
2394       }
2395       tmp[row++]=sum1;
2396       break;
2397     case 2:
2398       sum1 = b[r[row]];
2399       sum2 = b[r[row+1]];
2400       v2   = aa + ai[row+1];
2401 
2402       for (j=0; j<nz-1; j+=2) {
2403         i0    = vi[j];
2404         i1    = vi[j+1];
2405         tmp0  = tmps[i0];
2406         tmp1  = tmps[i1];
2407         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2408         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2409       }
2410       if (j == nz-1) {
2411         tmp0  = tmps[vi[j]];
2412         sum1 -= v1[j] *tmp0;
2413         sum2 -= v2[j] *tmp0;
2414       }
2415       sum2     -= v2[nz] * sum1;
2416       tmp[row++]=sum1;
2417       tmp[row++]=sum2;
2418       break;
2419     case 3:
2420       sum1 = b[r[row]];
2421       sum2 = b[r[row+1]];
2422       sum3 = b[r[row+2]];
2423       v2   = aa + ai[row+1];
2424       v3   = aa + ai[row+2];
2425 
2426       for (j=0; j<nz-1; j+=2) {
2427         i0    = vi[j];
2428         i1    = vi[j+1];
2429         tmp0  = tmps[i0];
2430         tmp1  = tmps[i1];
2431         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2432         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2433         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2434       }
2435       if (j == nz-1) {
2436         tmp0  = tmps[vi[j]];
2437         sum1 -= v1[j] *tmp0;
2438         sum2 -= v2[j] *tmp0;
2439         sum3 -= v3[j] *tmp0;
2440       }
2441       sum2     -= v2[nz] * sum1;
2442       sum3     -= v3[nz] * sum1;
2443       sum3     -= v3[nz+1] * sum2;
2444       tmp[row++]=sum1;
2445       tmp[row++]=sum2;
2446       tmp[row++]=sum3;
2447       break;
2448 
2449     case 4:
2450       sum1 = b[r[row]];
2451       sum2 = b[r[row+1]];
2452       sum3 = b[r[row+2]];
2453       sum4 = b[r[row+3]];
2454       v2   = aa + ai[row+1];
2455       v3   = aa + ai[row+2];
2456       v4   = aa + ai[row+3];
2457 
2458       for (j=0; j<nz-1; j+=2) {
2459         i0    = vi[j];
2460         i1    = vi[j+1];
2461         tmp0  = tmps[i0];
2462         tmp1  = tmps[i1];
2463         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2464         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2465         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2466         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2467       }
2468       if (j == nz-1) {
2469         tmp0  = tmps[vi[j]];
2470         sum1 -= v1[j] *tmp0;
2471         sum2 -= v2[j] *tmp0;
2472         sum3 -= v3[j] *tmp0;
2473         sum4 -= v4[j] *tmp0;
2474       }
2475       sum2 -= v2[nz] * sum1;
2476       sum3 -= v3[nz] * sum1;
2477       sum4 -= v4[nz] * sum1;
2478       sum3 -= v3[nz+1] * sum2;
2479       sum4 -= v4[nz+1] * sum2;
2480       sum4 -= v4[nz+2] * sum3;
2481 
2482       tmp[row++]=sum1;
2483       tmp[row++]=sum2;
2484       tmp[row++]=sum3;
2485       tmp[row++]=sum4;
2486       break;
2487     case 5:
2488       sum1 = b[r[row]];
2489       sum2 = b[r[row+1]];
2490       sum3 = b[r[row+2]];
2491       sum4 = b[r[row+3]];
2492       sum5 = b[r[row+4]];
2493       v2   = aa + ai[row+1];
2494       v3   = aa + ai[row+2];
2495       v4   = aa + ai[row+3];
2496       v5   = aa + ai[row+4];
2497 
2498       for (j=0; j<nz-1; j+=2) {
2499         i0    = vi[j];
2500         i1    = vi[j+1];
2501         tmp0  = tmps[i0];
2502         tmp1  = tmps[i1];
2503         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2504         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2505         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2506         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2507         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2508       }
2509       if (j == nz-1) {
2510         tmp0  = tmps[vi[j]];
2511         sum1 -= v1[j] *tmp0;
2512         sum2 -= v2[j] *tmp0;
2513         sum3 -= v3[j] *tmp0;
2514         sum4 -= v4[j] *tmp0;
2515         sum5 -= v5[j] *tmp0;
2516       }
2517 
2518       sum2 -= v2[nz] * sum1;
2519       sum3 -= v3[nz] * sum1;
2520       sum4 -= v4[nz] * sum1;
2521       sum5 -= v5[nz] * sum1;
2522       sum3 -= v3[nz+1] * sum2;
2523       sum4 -= v4[nz+1] * sum2;
2524       sum5 -= v5[nz+1] * sum2;
2525       sum4 -= v4[nz+2] * sum3;
2526       sum5 -= v5[nz+2] * sum3;
2527       sum5 -= v5[nz+3] * sum4;
2528 
2529       tmp[row++]=sum1;
2530       tmp[row++]=sum2;
2531       tmp[row++]=sum3;
2532       tmp[row++]=sum4;
2533       tmp[row++]=sum5;
2534       break;
2535     default:
2536       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2537     }
2538   }
2539   /* backward solve the upper triangular */
2540   for (i=node_max -1,row = n-1; i>=0; i--) {
2541     nsz = ns[i];
2542     aii = ad[row+1] + 1;
2543     v1  = aa + aii;
2544     vi  = aj + aii;
2545     nz  = ad[row]- ad[row+1] - 1;
2546 
2547     if (i > 0) {
2548       /* Prefetch the indices for the next block */
2549       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2550       /* Prefetch the data for the next block */
2551       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2552     }
2553 
2554     switch (nsz) {               /* Each loop in 'case' is unrolled */
2555     case 1:
2556       sum1 = tmp[row];
2557 
2558       for (j=0; j<nz-1; j+=2) {
2559         i0    = vi[j];
2560         i1    = vi[j+1];
2561         tmp0  = tmps[i0];
2562         tmp1  = tmps[i1];
2563         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2564       }
2565       if (j == nz-1) {
2566         tmp0  = tmps[vi[j]];
2567         sum1 -= v1[j]*tmp0;
2568       }
2569       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2570       break;
2571     case 2:
2572       sum1 = tmp[row];
2573       sum2 = tmp[row-1];
2574       v2   = aa + ad[row] + 1;
2575       for (j=0; j<nz-1; j+=2) {
2576         i0    = vi[j];
2577         i1    = vi[j+1];
2578         tmp0  = tmps[i0];
2579         tmp1  = tmps[i1];
2580         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2581         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2582       }
2583       if (j == nz-1) {
2584         tmp0  = tmps[vi[j]];
2585         sum1 -= v1[j]* tmp0;
2586         sum2 -= v2[j+1]* tmp0;
2587       }
2588 
2589       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2590       sum2     -= v2[0] * tmp0;
2591       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2592       break;
2593     case 3:
2594       sum1 = tmp[row];
2595       sum2 = tmp[row -1];
2596       sum3 = tmp[row -2];
2597       v2   = aa + ad[row] + 1;
2598       v3   = aa + ad[row -1] + 1;
2599       for (j=0; j<nz-1; j+=2) {
2600         i0    = vi[j];
2601         i1    = vi[j+1];
2602         tmp0  = tmps[i0];
2603         tmp1  = tmps[i1];
2604         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2605         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2606         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2607       }
2608       if (j== nz-1) {
2609         tmp0  = tmps[vi[j]];
2610         sum1 -= v1[j] * tmp0;
2611         sum2 -= v2[j+1] * tmp0;
2612         sum3 -= v3[j+2] * tmp0;
2613       }
2614       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2615       sum2     -= v2[0]* tmp0;
2616       sum3     -= v3[1] * tmp0;
2617       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2618       sum3     -= v3[0]* tmp0;
2619       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2620 
2621       break;
2622     case 4:
2623       sum1 = tmp[row];
2624       sum2 = tmp[row -1];
2625       sum3 = tmp[row -2];
2626       sum4 = tmp[row -3];
2627       v2   = aa + ad[row]+1;
2628       v3   = aa + ad[row -1]+1;
2629       v4   = aa + ad[row -2]+1;
2630 
2631       for (j=0; j<nz-1; j+=2) {
2632         i0    = vi[j];
2633         i1    = vi[j+1];
2634         tmp0  = tmps[i0];
2635         tmp1  = tmps[i1];
2636         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2637         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2638         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2639         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2640       }
2641       if (j== nz-1) {
2642         tmp0  = tmps[vi[j]];
2643         sum1 -= v1[j] * tmp0;
2644         sum2 -= v2[j+1] * tmp0;
2645         sum3 -= v3[j+2] * tmp0;
2646         sum4 -= v4[j+3] * tmp0;
2647       }
2648 
2649       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2650       sum2     -= v2[0] * tmp0;
2651       sum3     -= v3[1] * tmp0;
2652       sum4     -= v4[2] * tmp0;
2653       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2654       sum3     -= v3[0] * tmp0;
2655       sum4     -= v4[1] * tmp0;
2656       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2657       sum4     -= v4[0] * tmp0;
2658       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2659       break;
2660     case 5:
2661       sum1 = tmp[row];
2662       sum2 = tmp[row -1];
2663       sum3 = tmp[row -2];
2664       sum4 = tmp[row -3];
2665       sum5 = tmp[row -4];
2666       v2   = aa + ad[row]+1;
2667       v3   = aa + ad[row -1]+1;
2668       v4   = aa + ad[row -2]+1;
2669       v5   = aa + ad[row -3]+1;
2670       for (j=0; j<nz-1; j+=2) {
2671         i0    = vi[j];
2672         i1    = vi[j+1];
2673         tmp0  = tmps[i0];
2674         tmp1  = tmps[i1];
2675         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2676         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2677         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2678         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2679         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2680       }
2681       if (j==nz-1) {
2682         tmp0  = tmps[vi[j]];
2683         sum1 -= v1[j] * tmp0;
2684         sum2 -= v2[j+1] * tmp0;
2685         sum3 -= v3[j+2] * tmp0;
2686         sum4 -= v4[j+3] * tmp0;
2687         sum5 -= v5[j+4] * tmp0;
2688       }
2689 
2690       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2691       sum2     -= v2[0] * tmp0;
2692       sum3     -= v3[1] * tmp0;
2693       sum4     -= v4[2] * tmp0;
2694       sum5     -= v5[3] * tmp0;
2695       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2696       sum3     -= v3[0] * tmp0;
2697       sum4     -= v4[1] * tmp0;
2698       sum5     -= v5[2] * tmp0;
2699       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2700       sum4     -= v4[0] * tmp0;
2701       sum5     -= v5[1] * tmp0;
2702       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2703       sum5     -= v5[0] * tmp0;
2704       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2705       break;
2706     default:
2707       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2708     }
2709   }
2710   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2711   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2712   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2713   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2714   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2715   PetscFunctionReturn(0);
2716 }
2717 
2718 
2719 /*
2720      Makes a longer coloring[] array and calls the usual code with that
2721 */
2722 #undef __FUNCT__
2723 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2724 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2725 {
2726   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2727   PetscErrorCode  ierr;
2728   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2729   PetscInt        *colorused,i;
2730   ISColoringValue *newcolor;
2731 
2732   PetscFunctionBegin;
2733   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2734   /* loop over inodes, marking a color for each column*/
2735   row = 0;
2736   for (i=0; i<m; i++) {
2737     for (j=0; j<ns[i]; j++) {
2738       newcolor[row++] = coloring[i] + j*ncolors;
2739     }
2740   }
2741 
2742   /* eliminate unneeded colors */
2743   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2744   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2745   for (i=0; i<n; i++) {
2746     colorused[newcolor[i]] = 1;
2747   }
2748 
2749   for (i=1; i<5*ncolors; i++) {
2750     colorused[i] += colorused[i-1];
2751   }
2752   ncolors = colorused[5*ncolors-1];
2753   for (i=0; i<n; i++) {
2754     newcolor[i] = colorused[newcolor[i]]-1;
2755   }
2756   ierr = PetscFree(colorused);CHKERRQ(ierr);
2757   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2758   ierr = PetscFree(coloring);CHKERRQ(ierr);
2759   PetscFunctionReturn(0);
2760 }
2761 
2762 #include <petsc-private/kernels/blockinvert.h>
2763 
2764 #undef __FUNCT__
2765 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2766 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2767 {
2768   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2769   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2770   MatScalar         *ibdiag,*bdiag,work[25],*t;
2771   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2772   const MatScalar   *v = a->a,*v1,*v2,*v3,*v4,*v5;
2773   const PetscScalar *xb, *b;
2774   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2775   PetscErrorCode    ierr;
2776   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2777   PetscInt          sz,k,ipvt[5];
2778   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2779 
2780   PetscFunctionBegin;
2781   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2782   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2783 
2784   if (!a->inode.ibdiagvalid) {
2785     if (!a->inode.ibdiag) {
2786       /* calculate space needed for diagonal blocks */
2787       for (i=0; i<m; i++) {
2788         cnt += sizes[i]*sizes[i];
2789       }
2790       a->inode.bdiagsize = cnt;
2791 
2792       ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2793     }
2794 
2795     /* copy over the diagonal blocks and invert them */
2796     ibdiag = a->inode.ibdiag;
2797     bdiag  = a->inode.bdiag;
2798     cnt    = 0;
2799     for (i=0, row = 0; i<m; i++) {
2800       for (j=0; j<sizes[i]; j++) {
2801         for (k=0; k<sizes[i]; k++) {
2802           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2803         }
2804       }
2805       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2806 
2807       switch (sizes[i]) {
2808       case 1:
2809         /* Create matrix data structure */
2810         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2811         ibdiag[cnt] = 1.0/ibdiag[cnt];
2812         break;
2813       case 2:
2814         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2815         break;
2816       case 3:
2817         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2818         break;
2819       case 4:
2820         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2821         break;
2822       case 5:
2823         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2824         break;
2825       default:
2826         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2827       }
2828       cnt += sizes[i]*sizes[i];
2829       row += sizes[i];
2830     }
2831     a->inode.ibdiagvalid = PETSC_TRUE;
2832   }
2833   ibdiag = a->inode.ibdiag;
2834   bdiag  = a->inode.bdiag;
2835   t      = a->inode.ssor_work;
2836 
2837   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2838   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2839   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2840   if (flag & SOR_ZERO_INITIAL_GUESS) {
2841     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2842 
2843       for (i=0, row=0; i<m; i++) {
2844         sz  = diag[row] - ii[row];
2845         v1  = a->a + ii[row];
2846         idx = a->j + ii[row];
2847 
2848         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2849         switch (sizes[i]) {
2850         case 1:
2851 
2852           sum1 = b[row];
2853           for (n = 0; n<sz-1; n+=2) {
2854             i1    = idx[0];
2855             i2    = idx[1];
2856             idx  += 2;
2857             tmp0  = x[i1];
2858             tmp1  = x[i2];
2859             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2860           }
2861 
2862           if (n == sz-1) {
2863             tmp0  = x[*idx];
2864             sum1 -= *v1 * tmp0;
2865           }
2866           t[row]   = sum1;
2867           x[row++] = sum1*(*ibdiag++);
2868           break;
2869         case 2:
2870           v2   = a->a + ii[row+1];
2871           sum1 = b[row];
2872           sum2 = b[row+1];
2873           for (n = 0; n<sz-1; n+=2) {
2874             i1    = idx[0];
2875             i2    = idx[1];
2876             idx  += 2;
2877             tmp0  = x[i1];
2878             tmp1  = x[i2];
2879             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2880             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2881           }
2882 
2883           if (n == sz-1) {
2884             tmp0  = x[*idx];
2885             sum1 -= v1[0] * tmp0;
2886             sum2 -= v2[0] * tmp0;
2887           }
2888           t[row]   = sum1;
2889           t[row+1] = sum2;
2890           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2891           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2892           ibdiag  += 4;
2893           break;
2894         case 3:
2895           v2   = a->a + ii[row+1];
2896           v3   = a->a + ii[row+2];
2897           sum1 = b[row];
2898           sum2 = b[row+1];
2899           sum3 = b[row+2];
2900           for (n = 0; n<sz-1; n+=2) {
2901             i1    = idx[0];
2902             i2    = idx[1];
2903             idx  += 2;
2904             tmp0  = x[i1];
2905             tmp1  = x[i2];
2906             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2907             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2908             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2909           }
2910 
2911           if (n == sz-1) {
2912             tmp0  = x[*idx];
2913             sum1 -= v1[0] * tmp0;
2914             sum2 -= v2[0] * tmp0;
2915             sum3 -= v3[0] * tmp0;
2916           }
2917           t[row]   = sum1;
2918           t[row+1] = sum2;
2919           t[row+2] = sum3;
2920           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2921           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2922           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2923           ibdiag  += 9;
2924           break;
2925         case 4:
2926           v2   = a->a + ii[row+1];
2927           v3   = a->a + ii[row+2];
2928           v4   = a->a + ii[row+3];
2929           sum1 = b[row];
2930           sum2 = b[row+1];
2931           sum3 = b[row+2];
2932           sum4 = b[row+3];
2933           for (n = 0; n<sz-1; n+=2) {
2934             i1    = idx[0];
2935             i2    = idx[1];
2936             idx  += 2;
2937             tmp0  = x[i1];
2938             tmp1  = x[i2];
2939             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2940             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2941             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2942             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2943           }
2944 
2945           if (n == sz-1) {
2946             tmp0  = x[*idx];
2947             sum1 -= v1[0] * tmp0;
2948             sum2 -= v2[0] * tmp0;
2949             sum3 -= v3[0] * tmp0;
2950             sum4 -= v4[0] * tmp0;
2951           }
2952           t[row]   = sum1;
2953           t[row+1] = sum2;
2954           t[row+2] = sum3;
2955           t[row+3] = sum4;
2956           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2957           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2958           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2959           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2960           ibdiag  += 16;
2961           break;
2962         case 5:
2963           v2   = a->a + ii[row+1];
2964           v3   = a->a + ii[row+2];
2965           v4   = a->a + ii[row+3];
2966           v5   = a->a + ii[row+4];
2967           sum1 = b[row];
2968           sum2 = b[row+1];
2969           sum3 = b[row+2];
2970           sum4 = b[row+3];
2971           sum5 = b[row+4];
2972           for (n = 0; n<sz-1; n+=2) {
2973             i1    = idx[0];
2974             i2    = idx[1];
2975             idx  += 2;
2976             tmp0  = x[i1];
2977             tmp1  = x[i2];
2978             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2979             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2980             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2981             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2982             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2983           }
2984 
2985           if (n == sz-1) {
2986             tmp0  = x[*idx];
2987             sum1 -= v1[0] * tmp0;
2988             sum2 -= v2[0] * tmp0;
2989             sum3 -= v3[0] * tmp0;
2990             sum4 -= v4[0] * tmp0;
2991             sum5 -= v5[0] * tmp0;
2992           }
2993           t[row]   = sum1;
2994           t[row+1] = sum2;
2995           t[row+2] = sum3;
2996           t[row+3] = sum4;
2997           t[row+4] = sum5;
2998           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2999           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3000           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3001           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3002           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3003           ibdiag  += 25;
3004           break;
3005         default:
3006           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3007         }
3008       }
3009 
3010       xb   = t;
3011       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3012     } else xb = b;
3013     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3014 
3015       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3016       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3017         ibdiag -= sizes[i]*sizes[i];
3018         sz      = ii[row+1] - diag[row] - 1;
3019         v1      = a->a + diag[row] + 1;
3020         idx     = a->j + diag[row] + 1;
3021 
3022         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3023         switch (sizes[i]) {
3024         case 1:
3025 
3026           sum1 = xb[row];
3027           for (n = 0; n<sz-1; n+=2) {
3028             i1    = idx[0];
3029             i2    = idx[1];
3030             idx  += 2;
3031             tmp0  = x[i1];
3032             tmp1  = x[i2];
3033             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3034           }
3035 
3036           if (n == sz-1) {
3037             tmp0  = x[*idx];
3038             sum1 -= *v1*tmp0;
3039           }
3040           x[row--] = sum1*(*ibdiag);
3041           break;
3042 
3043         case 2:
3044 
3045           sum1 = xb[row];
3046           sum2 = xb[row-1];
3047           /* note that sum1 is associated with the second of the two rows */
3048           v2 = a->a + diag[row-1] + 2;
3049           for (n = 0; n<sz-1; n+=2) {
3050             i1    = idx[0];
3051             i2    = idx[1];
3052             idx  += 2;
3053             tmp0  = x[i1];
3054             tmp1  = x[i2];
3055             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3056             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3057           }
3058 
3059           if (n == sz-1) {
3060             tmp0  = x[*idx];
3061             sum1 -= *v1*tmp0;
3062             sum2 -= *v2*tmp0;
3063           }
3064           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3065           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3066           break;
3067         case 3:
3068 
3069           sum1 = xb[row];
3070           sum2 = xb[row-1];
3071           sum3 = xb[row-2];
3072           v2   = a->a + diag[row-1] + 2;
3073           v3   = a->a + diag[row-2] + 3;
3074           for (n = 0; n<sz-1; n+=2) {
3075             i1    = idx[0];
3076             i2    = idx[1];
3077             idx  += 2;
3078             tmp0  = x[i1];
3079             tmp1  = x[i2];
3080             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3081             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3082             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3083           }
3084 
3085           if (n == sz-1) {
3086             tmp0  = x[*idx];
3087             sum1 -= *v1*tmp0;
3088             sum2 -= *v2*tmp0;
3089             sum3 -= *v3*tmp0;
3090           }
3091           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3092           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3093           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3094           break;
3095         case 4:
3096 
3097           sum1 = xb[row];
3098           sum2 = xb[row-1];
3099           sum3 = xb[row-2];
3100           sum4 = xb[row-3];
3101           v2   = a->a + diag[row-1] + 2;
3102           v3   = a->a + diag[row-2] + 3;
3103           v4   = a->a + diag[row-3] + 4;
3104           for (n = 0; n<sz-1; n+=2) {
3105             i1    = idx[0];
3106             i2    = idx[1];
3107             idx  += 2;
3108             tmp0  = x[i1];
3109             tmp1  = x[i2];
3110             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3111             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3112             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3113             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3114           }
3115 
3116           if (n == sz-1) {
3117             tmp0  = x[*idx];
3118             sum1 -= *v1*tmp0;
3119             sum2 -= *v2*tmp0;
3120             sum3 -= *v3*tmp0;
3121             sum4 -= *v4*tmp0;
3122           }
3123           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3124           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3125           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3126           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3127           break;
3128         case 5:
3129 
3130           sum1 = xb[row];
3131           sum2 = xb[row-1];
3132           sum3 = xb[row-2];
3133           sum4 = xb[row-3];
3134           sum5 = xb[row-4];
3135           v2   = a->a + diag[row-1] + 2;
3136           v3   = a->a + diag[row-2] + 3;
3137           v4   = a->a + diag[row-3] + 4;
3138           v5   = a->a + diag[row-4] + 5;
3139           for (n = 0; n<sz-1; n+=2) {
3140             i1    = idx[0];
3141             i2    = idx[1];
3142             idx  += 2;
3143             tmp0  = x[i1];
3144             tmp1  = x[i2];
3145             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3146             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3147             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3148             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3149             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3150           }
3151 
3152           if (n == sz-1) {
3153             tmp0  = x[*idx];
3154             sum1 -= *v1*tmp0;
3155             sum2 -= *v2*tmp0;
3156             sum3 -= *v3*tmp0;
3157             sum4 -= *v4*tmp0;
3158             sum5 -= *v5*tmp0;
3159           }
3160           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3161           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3162           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3163           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3164           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3165           break;
3166         default:
3167           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3168         }
3169       }
3170 
3171       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3172     }
3173     its--;
3174   }
3175   while (its--) {
3176 
3177     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3178       ibdiag = a->inode.ibdiag;
3179 
3180       for (i=0, row=0; i<m; i++) {
3181         sz  = ii[row + 1] - ii[row];
3182         v1  = a->a + ii[row];
3183         idx = a->j + ii[row];
3184 
3185         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3186         switch (sizes[i]) {
3187         case 1:
3188 
3189           sum1 = b[row];
3190           for (n = 0; n<sz-1; n+=2) {
3191             i1    = idx[0];
3192             i2    = idx[1];
3193             idx  += 2;
3194             tmp0  = x[i1];
3195             tmp1  = x[i2];
3196             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3197           }
3198 
3199           if (n == sz-1) {
3200             tmp0  = x[*idx];
3201             sum1 -= *v1 * tmp0;
3202           }
3203 
3204           /* in MatSOR_SeqAIJ this line would be
3205            *
3206            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3207            *
3208            * but omega == 1, so this becomes
3209            *
3210            * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++);
3211            *
3212            * but bdiag and ibdiag cancel each other, so we can change this
3213            * to adding sum1*(*ibdiag++).  We can skip bdiag for the larger
3214            * block sizes as well
3215            */
3216           x[row++] += sum1*(*ibdiag++);
3217           break;
3218         case 2:
3219           v2   = a->a + ii[row+1];
3220           sum1 = b[row];
3221           sum2 = b[row+1];
3222           for (n = 0; n<sz-1; n+=2) {
3223             i1    = idx[0];
3224             i2    = idx[1];
3225             idx  += 2;
3226             tmp0  = x[i1];
3227             tmp1  = x[i2];
3228             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3229             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3230           }
3231 
3232           if (n == sz-1) {
3233             tmp0  = x[*idx];
3234             sum1 -= v1[0] * tmp0;
3235             sum2 -= v2[0] * tmp0;
3236           }
3237           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2];
3238           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3];
3239           ibdiag   += 4;
3240           break;
3241         case 3:
3242           v2   = a->a + ii[row+1];
3243           v3   = a->a + ii[row+2];
3244           sum1 = b[row];
3245           sum2 = b[row+1];
3246           sum3 = b[row+2];
3247           for (n = 0; n<sz-1; n+=2) {
3248             i1    = idx[0];
3249             i2    = idx[1];
3250             idx  += 2;
3251             tmp0  = x[i1];
3252             tmp1  = x[i2];
3253             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3254             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3255             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3256           }
3257 
3258           if (n == sz-1) {
3259             tmp0  = x[*idx];
3260             sum1 -= v1[0] * tmp0;
3261             sum2 -= v2[0] * tmp0;
3262             sum3 -= v3[0] * tmp0;
3263           }
3264           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3265           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3266           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3267           ibdiag   += 9;
3268           break;
3269         case 4:
3270           v2   = a->a + ii[row+1];
3271           v3   = a->a + ii[row+2];
3272           v4   = a->a + ii[row+3];
3273           sum1 = b[row];
3274           sum2 = b[row+1];
3275           sum3 = b[row+2];
3276           sum4 = b[row+3];
3277           for (n = 0; n<sz-1; n+=2) {
3278             i1    = idx[0];
3279             i2    = idx[1];
3280             idx  += 2;
3281             tmp0  = x[i1];
3282             tmp1  = x[i2];
3283             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3284             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3285             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3286             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3287           }
3288 
3289           if (n == sz-1) {
3290             tmp0  = x[*idx];
3291             sum1 -= v1[0] * tmp0;
3292             sum2 -= v2[0] * tmp0;
3293             sum3 -= v3[0] * tmp0;
3294             sum4 -= v4[0] * tmp0;
3295           }
3296           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3297           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3298           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3299           x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3300           ibdiag   += 16;
3301           break;
3302         case 5:
3303           v2   = a->a + ii[row+1];
3304           v3   = a->a + ii[row+2];
3305           v4   = a->a + ii[row+3];
3306           v5   = a->a + ii[row+4];
3307           sum1 = b[row];
3308           sum2 = b[row+1];
3309           sum3 = b[row+2];
3310           sum4 = b[row+3];
3311           sum5 = b[row+4];
3312           for (n = 0; n<sz-1; n+=2) {
3313             i1    = idx[0];
3314             i2    = idx[1];
3315             idx  += 2;
3316             tmp0  = x[i1];
3317             tmp1  = x[i2];
3318             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3319             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3320             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3321             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3322             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3323           }
3324 
3325           if (n == sz-1) {
3326             tmp0  = x[*idx];
3327             sum1 -= v1[0] * tmp0;
3328             sum2 -= v2[0] * tmp0;
3329             sum3 -= v3[0] * tmp0;
3330             sum4 -= v4[0] * tmp0;
3331             sum5 -= v5[0] * tmp0;
3332           }
3333           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3334           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3335           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3336           x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3337           x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3338           ibdiag   += 25;
3339           break;
3340         default:
3341           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3342         }
3343       }
3344 
3345       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3346     }
3347     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3348 
3349       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3350       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3351         ibdiag -= sizes[i]*sizes[i];
3352         sz      = ii[row+1] - ii[row];
3353         v1      = a->a + ii[row];
3354         idx     = a->j + ii[row];
3355 
3356         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3357         switch (sizes[i]) {
3358         case 1:
3359 
3360           sum1 = b[row];
3361           for (n = 0; n<sz-1; n+=2) {
3362             i1    = idx[0];
3363             i2    = idx[1];
3364             idx  += 2;
3365             tmp0  = x[i1];
3366             tmp1  = x[i2];
3367             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3368           }
3369 
3370           if (n == sz-1) {
3371             tmp0  = x[*idx];
3372             sum1 -= *v1*tmp0;
3373           }
3374           x[row--] += sum1*(*ibdiag);
3375           break;
3376 
3377         case 2:
3378 
3379           sum1 = b[row];
3380           sum2 = b[row-1];
3381           /* note that sum1 is associated with the second of the two rows */
3382           v2 = a->a + ii[row - 1];
3383           for (n = 0; n<sz-1; n+=2) {
3384             i1    = idx[0];
3385             i2    = idx[1];
3386             idx  += 2;
3387             tmp0  = x[i1];
3388             tmp1  = x[i2];
3389             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3390             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3391           }
3392 
3393           if (n == sz-1) {
3394             tmp0  = x[*idx];
3395             sum1 -= *v1*tmp0;
3396             sum2 -= *v2*tmp0;
3397           }
3398           x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3399           x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3400           break;
3401         case 3:
3402 
3403           sum1 = b[row];
3404           sum2 = b[row-1];
3405           sum3 = b[row-2];
3406           v2   = a->a + ii[row-1];
3407           v3   = a->a + ii[row-2];
3408           for (n = 0; n<sz-1; n+=2) {
3409             i1    = idx[0];
3410             i2    = idx[1];
3411             idx  += 2;
3412             tmp0  = x[i1];
3413             tmp1  = x[i2];
3414             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3415             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3416             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3417           }
3418 
3419           if (n == sz-1) {
3420             tmp0  = x[*idx];
3421             sum1 -= *v1*tmp0;
3422             sum2 -= *v2*tmp0;
3423             sum3 -= *v3*tmp0;
3424           }
3425           x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3426           x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3427           x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3428           break;
3429         case 4:
3430 
3431           sum1 = b[row];
3432           sum2 = b[row-1];
3433           sum3 = b[row-2];
3434           sum4 = b[row-3];
3435           v2   = a->a + ii[row-1];
3436           v3   = a->a + ii[row-2];
3437           v4   = a->a + ii[row-3];
3438           for (n = 0; n<sz-1; n+=2) {
3439             i1    = idx[0];
3440             i2    = idx[1];
3441             idx  += 2;
3442             tmp0  = x[i1];
3443             tmp1  = x[i2];
3444             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3445             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3446             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3447             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3448           }
3449 
3450           if (n == sz-1) {
3451             tmp0  = x[*idx];
3452             sum1 -= *v1*tmp0;
3453             sum2 -= *v2*tmp0;
3454             sum3 -= *v3*tmp0;
3455             sum4 -= *v4*tmp0;
3456           }
3457           x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3458           x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3459           x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3460           x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3461           break;
3462         case 5:
3463 
3464           sum1 = b[row];
3465           sum2 = b[row-1];
3466           sum3 = b[row-2];
3467           sum4 = b[row-3];
3468           sum5 = b[row-4];
3469           v2   = a->a + ii[row-1];
3470           v3   = a->a + ii[row-2];
3471           v4   = a->a + ii[row-3];
3472           v5   = a->a + ii[row-4];
3473           for (n = 0; n<sz-1; n+=2) {
3474             i1    = idx[0];
3475             i2    = idx[1];
3476             idx  += 2;
3477             tmp0  = x[i1];
3478             tmp1  = x[i2];
3479             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3480             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3481             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3482             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3483             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3484           }
3485 
3486           if (n == sz-1) {
3487             tmp0  = x[*idx];
3488             sum1 -= *v1*tmp0;
3489             sum2 -= *v2*tmp0;
3490             sum3 -= *v3*tmp0;
3491             sum4 -= *v4*tmp0;
3492             sum5 -= *v5*tmp0;
3493           }
3494           x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3495           x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3496           x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3497           x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3498           x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3499           break;
3500         default:
3501           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3502         }
3503       }
3504 
3505       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3506     }
3507   }
3508   if (flag & SOR_EISENSTAT) {
3509     /*
3510           Apply  (U + D)^-1  where D is now the block diagonal
3511     */
3512     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3513     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3514       ibdiag -= sizes[i]*sizes[i];
3515       sz      = ii[row+1] - diag[row] - 1;
3516       v1      = a->a + diag[row] + 1;
3517       idx     = a->j + diag[row] + 1;
3518       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3519       switch (sizes[i]) {
3520       case 1:
3521 
3522         sum1 = b[row];
3523         for (n = 0; n<sz-1; n+=2) {
3524           i1    = idx[0];
3525           i2    = idx[1];
3526           idx  += 2;
3527           tmp0  = x[i1];
3528           tmp1  = x[i2];
3529           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3530         }
3531 
3532         if (n == sz-1) {
3533           tmp0  = x[*idx];
3534           sum1 -= *v1*tmp0;
3535         }
3536         x[row] = sum1*(*ibdiag);row--;
3537         break;
3538 
3539       case 2:
3540 
3541         sum1 = b[row];
3542         sum2 = b[row-1];
3543         /* note that sum1 is associated with the second of the two rows */
3544         v2 = a->a + diag[row-1] + 2;
3545         for (n = 0; n<sz-1; n+=2) {
3546           i1    = idx[0];
3547           i2    = idx[1];
3548           idx  += 2;
3549           tmp0  = x[i1];
3550           tmp1  = x[i2];
3551           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3552           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3553         }
3554 
3555         if (n == sz-1) {
3556           tmp0  = x[*idx];
3557           sum1 -= *v1*tmp0;
3558           sum2 -= *v2*tmp0;
3559         }
3560         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3561         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3562         row     -= 2;
3563         break;
3564       case 3:
3565 
3566         sum1 = b[row];
3567         sum2 = b[row-1];
3568         sum3 = b[row-2];
3569         v2   = a->a + diag[row-1] + 2;
3570         v3   = a->a + diag[row-2] + 3;
3571         for (n = 0; n<sz-1; n+=2) {
3572           i1    = idx[0];
3573           i2    = idx[1];
3574           idx  += 2;
3575           tmp0  = x[i1];
3576           tmp1  = x[i2];
3577           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3578           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3579           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3580         }
3581 
3582         if (n == sz-1) {
3583           tmp0  = x[*idx];
3584           sum1 -= *v1*tmp0;
3585           sum2 -= *v2*tmp0;
3586           sum3 -= *v3*tmp0;
3587         }
3588         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3589         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3590         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3591         row     -= 3;
3592         break;
3593       case 4:
3594 
3595         sum1 = b[row];
3596         sum2 = b[row-1];
3597         sum3 = b[row-2];
3598         sum4 = b[row-3];
3599         v2   = a->a + diag[row-1] + 2;
3600         v3   = a->a + diag[row-2] + 3;
3601         v4   = a->a + diag[row-3] + 4;
3602         for (n = 0; n<sz-1; n+=2) {
3603           i1    = idx[0];
3604           i2    = idx[1];
3605           idx  += 2;
3606           tmp0  = x[i1];
3607           tmp1  = x[i2];
3608           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3609           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3610           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3611           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3612         }
3613 
3614         if (n == sz-1) {
3615           tmp0  = x[*idx];
3616           sum1 -= *v1*tmp0;
3617           sum2 -= *v2*tmp0;
3618           sum3 -= *v3*tmp0;
3619           sum4 -= *v4*tmp0;
3620         }
3621         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3622         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3623         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3624         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3625         row     -= 4;
3626         break;
3627       case 5:
3628 
3629         sum1 = b[row];
3630         sum2 = b[row-1];
3631         sum3 = b[row-2];
3632         sum4 = b[row-3];
3633         sum5 = b[row-4];
3634         v2   = a->a + diag[row-1] + 2;
3635         v3   = a->a + diag[row-2] + 3;
3636         v4   = a->a + diag[row-3] + 4;
3637         v5   = a->a + diag[row-4] + 5;
3638         for (n = 0; n<sz-1; n+=2) {
3639           i1    = idx[0];
3640           i2    = idx[1];
3641           idx  += 2;
3642           tmp0  = x[i1];
3643           tmp1  = x[i2];
3644           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3645           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3646           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3647           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3648           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3649         }
3650 
3651         if (n == sz-1) {
3652           tmp0  = x[*idx];
3653           sum1 -= *v1*tmp0;
3654           sum2 -= *v2*tmp0;
3655           sum3 -= *v3*tmp0;
3656           sum4 -= *v4*tmp0;
3657           sum5 -= *v5*tmp0;
3658         }
3659         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3660         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3661         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3662         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3663         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3664         row     -= 5;
3665         break;
3666       default:
3667         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3668       }
3669     }
3670     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3671 
3672     /*
3673            t = b - D x    where D is the block diagonal
3674     */
3675     cnt = 0;
3676     for (i=0, row=0; i<m; i++) {
3677       switch (sizes[i]) {
3678       case 1:
3679         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3680         break;
3681       case 2:
3682         x1       = x[row]; x2 = x[row+1];
3683         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3684         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3685         t[row]   = b[row] - tmp1;
3686         t[row+1] = b[row+1] - tmp2; row += 2;
3687         cnt     += 4;
3688         break;
3689       case 3:
3690         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3691         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3692         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3693         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3694         t[row]   = b[row] - tmp1;
3695         t[row+1] = b[row+1] - tmp2;
3696         t[row+2] = b[row+2] - tmp3; row += 3;
3697         cnt     += 9;
3698         break;
3699       case 4:
3700         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3701         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3702         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3703         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3704         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3705         t[row]   = b[row] - tmp1;
3706         t[row+1] = b[row+1] - tmp2;
3707         t[row+2] = b[row+2] - tmp3;
3708         t[row+3] = b[row+3] - tmp4; row += 4;
3709         cnt     += 16;
3710         break;
3711       case 5:
3712         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3713         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3714         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3715         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3716         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3717         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3718         t[row]   = b[row] - tmp1;
3719         t[row+1] = b[row+1] - tmp2;
3720         t[row+2] = b[row+2] - tmp3;
3721         t[row+3] = b[row+3] - tmp4;
3722         t[row+4] = b[row+4] - tmp5;row += 5;
3723         cnt     += 25;
3724         break;
3725       default:
3726         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3727       }
3728     }
3729     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3730 
3731 
3732 
3733     /*
3734           Apply (L + D)^-1 where D is the block diagonal
3735     */
3736     for (i=0, row=0; i<m; i++) {
3737       sz  = diag[row] - ii[row];
3738       v1  = a->a + ii[row];
3739       idx = a->j + ii[row];
3740       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3741       switch (sizes[i]) {
3742       case 1:
3743 
3744         sum1 = t[row];
3745         for (n = 0; n<sz-1; n+=2) {
3746           i1    = idx[0];
3747           i2    = idx[1];
3748           idx  += 2;
3749           tmp0  = t[i1];
3750           tmp1  = t[i2];
3751           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3752         }
3753 
3754         if (n == sz-1) {
3755           tmp0  = t[*idx];
3756           sum1 -= *v1 * tmp0;
3757         }
3758         x[row] += t[row] = sum1*(*ibdiag++); row++;
3759         break;
3760       case 2:
3761         v2   = a->a + ii[row+1];
3762         sum1 = t[row];
3763         sum2 = t[row+1];
3764         for (n = 0; n<sz-1; n+=2) {
3765           i1    = idx[0];
3766           i2    = idx[1];
3767           idx  += 2;
3768           tmp0  = t[i1];
3769           tmp1  = t[i2];
3770           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3771           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3772         }
3773 
3774         if (n == sz-1) {
3775           tmp0  = t[*idx];
3776           sum1 -= v1[0] * tmp0;
3777           sum2 -= v2[0] * tmp0;
3778         }
3779         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3780         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3781         ibdiag   += 4; row += 2;
3782         break;
3783       case 3:
3784         v2   = a->a + ii[row+1];
3785         v3   = a->a + ii[row+2];
3786         sum1 = t[row];
3787         sum2 = t[row+1];
3788         sum3 = t[row+2];
3789         for (n = 0; n<sz-1; n+=2) {
3790           i1    = idx[0];
3791           i2    = idx[1];
3792           idx  += 2;
3793           tmp0  = t[i1];
3794           tmp1  = t[i2];
3795           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3796           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3797           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3798         }
3799 
3800         if (n == sz-1) {
3801           tmp0  = t[*idx];
3802           sum1 -= v1[0] * tmp0;
3803           sum2 -= v2[0] * tmp0;
3804           sum3 -= v3[0] * tmp0;
3805         }
3806         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3807         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3808         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3809         ibdiag   += 9; row += 3;
3810         break;
3811       case 4:
3812         v2   = a->a + ii[row+1];
3813         v3   = a->a + ii[row+2];
3814         v4   = a->a + ii[row+3];
3815         sum1 = t[row];
3816         sum2 = t[row+1];
3817         sum3 = t[row+2];
3818         sum4 = t[row+3];
3819         for (n = 0; n<sz-1; n+=2) {
3820           i1    = idx[0];
3821           i2    = idx[1];
3822           idx  += 2;
3823           tmp0  = t[i1];
3824           tmp1  = t[i2];
3825           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3826           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3827           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3828           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3829         }
3830 
3831         if (n == sz-1) {
3832           tmp0  = t[*idx];
3833           sum1 -= v1[0] * tmp0;
3834           sum2 -= v2[0] * tmp0;
3835           sum3 -= v3[0] * tmp0;
3836           sum4 -= v4[0] * tmp0;
3837         }
3838         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3839         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3840         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3841         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3842         ibdiag   += 16; row += 4;
3843         break;
3844       case 5:
3845         v2   = a->a + ii[row+1];
3846         v3   = a->a + ii[row+2];
3847         v4   = a->a + ii[row+3];
3848         v5   = a->a + ii[row+4];
3849         sum1 = t[row];
3850         sum2 = t[row+1];
3851         sum3 = t[row+2];
3852         sum4 = t[row+3];
3853         sum5 = t[row+4];
3854         for (n = 0; n<sz-1; n+=2) {
3855           i1    = idx[0];
3856           i2    = idx[1];
3857           idx  += 2;
3858           tmp0  = t[i1];
3859           tmp1  = t[i2];
3860           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3861           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3862           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3863           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3864           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3865         }
3866 
3867         if (n == sz-1) {
3868           tmp0  = t[*idx];
3869           sum1 -= v1[0] * tmp0;
3870           sum2 -= v2[0] * tmp0;
3871           sum3 -= v3[0] * tmp0;
3872           sum4 -= v4[0] * tmp0;
3873           sum5 -= v5[0] * tmp0;
3874         }
3875         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3876         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3877         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3878         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3879         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3880         ibdiag   += 25; row += 5;
3881         break;
3882       default:
3883         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3884       }
3885     }
3886     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3887   }
3888   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3889   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3890   PetscFunctionReturn(0);
3891 }
3892 
3893 #undef __FUNCT__
3894 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3895 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3896 {
3897   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3898   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3899   const MatScalar   *bdiag = a->inode.bdiag;
3900   const PetscScalar *b;
3901   PetscErrorCode    ierr;
3902   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3903   const PetscInt    *sizes = a->inode.size;
3904 
3905   PetscFunctionBegin;
3906   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3907   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3908   cnt  = 0;
3909   for (i=0, row=0; i<m; i++) {
3910     switch (sizes[i]) {
3911     case 1:
3912       x[row] = b[row]*bdiag[cnt++];row++;
3913       break;
3914     case 2:
3915       x1       = b[row]; x2 = b[row+1];
3916       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3917       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3918       x[row++] = tmp1;
3919       x[row++] = tmp2;
3920       cnt     += 4;
3921       break;
3922     case 3:
3923       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3924       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3925       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3926       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3927       x[row++] = tmp1;
3928       x[row++] = tmp2;
3929       x[row++] = tmp3;
3930       cnt     += 9;
3931       break;
3932     case 4:
3933       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3934       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3935       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3936       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3937       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3938       x[row++] = tmp1;
3939       x[row++] = tmp2;
3940       x[row++] = tmp3;
3941       x[row++] = tmp4;
3942       cnt     += 16;
3943       break;
3944     case 5:
3945       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3946       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3947       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3948       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3949       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3950       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3951       x[row++] = tmp1;
3952       x[row++] = tmp2;
3953       x[row++] = tmp3;
3954       x[row++] = tmp4;
3955       x[row++] = tmp5;
3956       cnt     += 25;
3957       break;
3958     default:
3959       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3960     }
3961   }
3962   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3963   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3964   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3965   PetscFunctionReturn(0);
3966 }
3967 
3968 /*
3969     samestructure indicates that the matrix has not changed its nonzero structure so we
3970     do not need to recompute the inodes
3971 */
3972 #undef __FUNCT__
3973 #define __FUNCT__ "Mat_CheckInode"
3974 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure)
3975 {
3976   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3977   PetscErrorCode ierr;
3978   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
3979   PetscBool      flag;
3980   const PetscInt *idx,*idy,*ii;
3981 
3982   PetscFunctionBegin;
3983   if (!a->inode.use) PetscFunctionReturn(0);
3984   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3985 
3986 
3987   m = A->rmap->n;
3988   if (a->inode.size) ns = a->inode.size;
3989   else {
3990     ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);
3991   }
3992 
3993   i          = 0;
3994   node_count = 0;
3995   idx        = a->j;
3996   ii         = a->i;
3997   while (i < m) {                /* For each row */
3998     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3999     /* Limits the number of elements in a node to 'a->inode.limit' */
4000     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4001       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4002       if (nzy != nzx) break;
4003       idy += nzx;              /* Same nonzero pattern */
4004       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4005       if (!flag) break;
4006     }
4007     ns[node_count++] = blk_size;
4008     idx             += blk_size*nzx;
4009     i                = j;
4010   }
4011   /* If not enough inodes found,, do not use inode version of the routines */
4012   if (!m || node_count > .8*m) {
4013     ierr = PetscFree(ns);CHKERRQ(ierr);
4014 
4015     a->inode.node_count       = 0;
4016     a->inode.size             = NULL;
4017     a->inode.use              = PETSC_FALSE;
4018     A->ops->mult              = MatMult_SeqAIJ;
4019     A->ops->sor               = MatSOR_SeqAIJ;
4020     A->ops->multadd           = MatMultAdd_SeqAIJ;
4021     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4022     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4023     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4024     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4025     A->ops->coloringpatch     = 0;
4026     A->ops->multdiagonalblock = 0;
4027 
4028     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4029   } else {
4030     if (!A->factortype) {
4031       A->ops->mult              = MatMult_SeqAIJ_Inode;
4032       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4033       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4034       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4035       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4036       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4037       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4038       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4039       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4040     } else {
4041       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4042     }
4043     a->inode.node_count = node_count;
4044     a->inode.size       = ns;
4045     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4046   }
4047   a->inode.checked = PETSC_TRUE;
4048   PetscFunctionReturn(0);
4049 }
4050 
4051 #undef __FUNCT__
4052 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4053 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4054 {
4055   Mat            B =*C;
4056   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4057   PetscErrorCode ierr;
4058   PetscInt       m=A->rmap->n;
4059 
4060   PetscFunctionBegin;
4061   c->inode.use       = a->inode.use;
4062   c->inode.limit     = a->inode.limit;
4063   c->inode.max_limit = a->inode.max_limit;
4064   if (a->inode.size) {
4065     ierr                = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr);
4066     c->inode.node_count = a->inode.node_count;
4067     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4068     /* note the table of functions below should match that in Mat_CheckInode() */
4069     if (!B->factortype) {
4070       B->ops->mult              = MatMult_SeqAIJ_Inode;
4071       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4072       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4073       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4074       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4075       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4076       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4077       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4078       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4079     } else {
4080       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4081     }
4082   } else {
4083     c->inode.size       = 0;
4084     c->inode.node_count = 0;
4085   }
4086   c->inode.ibdiagvalid = PETSC_FALSE;
4087   c->inode.ibdiag      = 0;
4088   c->inode.bdiag       = 0;
4089   PetscFunctionReturn(0);
4090 }
4091 
4092 #undef __FUNCT__
4093 #define __FUNCT__ "MatGetRow_FactoredLU"
4094 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4095 {
4096   PetscInt       k;
4097   const PetscInt *vi;
4098 
4099   PetscFunctionBegin;
4100   vi = aj + ai[row];
4101   for (k=0; k<nzl; k++) cols[k] = vi[k];
4102   vi        = aj + adiag[row];
4103   cols[nzl] = vi[0];
4104   vi        = aj + adiag[row+1]+1;
4105   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4106   PetscFunctionReturn(0);
4107 }
4108 /*
4109    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4110    Modified from Mat_CheckInode().
4111 
4112    Input Parameters:
4113 .  Mat A - ILU or LU matrix factor
4114 
4115 */
4116 #undef __FUNCT__
4117 #define __FUNCT__ "Mat_CheckInode_FactorLU"
4118 PetscErrorCode Mat_CheckInode_FactorLU(Mat A)
4119 {
4120   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4121   PetscErrorCode ierr;
4122   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4123   PetscInt       *cols1,*cols2,*ns;
4124   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4125   PetscBool      flag;
4126 
4127   PetscFunctionBegin;
4128   if (!a->inode.use)    PetscFunctionReturn(0);
4129   if (a->inode.checked) PetscFunctionReturn(0);
4130 
4131   m = A->rmap->n;
4132   if (a->inode.size) ns = a->inode.size;
4133   else {
4134     ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);
4135   }
4136 
4137   i          = 0;
4138   node_count = 0;
4139   ierr = PetscMalloc2(m,PetscInt,&cols1,m,PetscInt,&cols2);CHKERRQ(ierr);
4140   while (i < m) {                /* For each row */
4141     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4142     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4143     nzx  = nzl1 + nzu1 + 1;
4144     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4145 
4146     /* Limits the number of elements in a node to 'a->inode.limit' */
4147     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4148       nzl2 = ai[j+1] - ai[j];
4149       nzu2 = adiag[j] - adiag[j+1] - 1;
4150       nzy  = nzl2 + nzu2 + 1;
4151       if (nzy != nzx) break;
4152       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4153       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4154       if (!flag) break;
4155     }
4156     ns[node_count++] = blk_size;
4157     i                = j;
4158   }
4159   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4160   /* If not enough inodes found,, do not use inode version of the routines */
4161   if (!m || node_count > .8*m) {
4162     ierr = PetscFree(ns);CHKERRQ(ierr);
4163 
4164     a->inode.node_count = 0;
4165     a->inode.size       = NULL;
4166     a->inode.use        = PETSC_FALSE;
4167 
4168     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4169   } else {
4170     A->ops->mult              = 0;
4171     A->ops->sor               = 0;
4172     A->ops->multadd           = 0;
4173     A->ops->getrowij          = 0;
4174     A->ops->restorerowij      = 0;
4175     A->ops->getcolumnij       = 0;
4176     A->ops->restorecolumnij   = 0;
4177     A->ops->coloringpatch     = 0;
4178     A->ops->multdiagonalblock = 0;
4179     a->inode.node_count       = node_count;
4180     a->inode.size             = ns;
4181 
4182     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4183   }
4184   a->inode.checked = PETSC_TRUE;
4185   PetscFunctionReturn(0);
4186 }
4187 
4188 #undef __FUNCT__
4189 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4190 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4191 {
4192   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4193 
4194   PetscFunctionBegin;
4195   a->inode.ibdiagvalid = PETSC_FALSE;
4196   PetscFunctionReturn(0);
4197 }
4198 
4199 /*
4200      This is really ugly. if inodes are used this replaces the
4201   permutations with ones that correspond to rows/cols of the matrix
4202   rather then inode blocks
4203 */
4204 #undef __FUNCT__
4205 #define __FUNCT__ "MatInodeAdjustForInodes"
4206 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4207 {
4208   PetscErrorCode ierr;
4209 
4210   PetscFunctionBegin;
4211   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4212   PetscFunctionReturn(0);
4213 }
4214 
4215 #undef __FUNCT__
4216 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4217 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4218 {
4219   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4220   PetscErrorCode ierr;
4221   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4222   const PetscInt *ridx,*cidx;
4223   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4224   PetscInt       nslim_col,*ns_col;
4225   IS             ris = *rperm,cis = *cperm;
4226 
4227   PetscFunctionBegin;
4228   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4229   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4230 
4231   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4232   ierr = PetscMalloc((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
4233   ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
4234 
4235   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4236   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4237 
4238   /* Form the inode structure for the rows of permuted matric using inv perm*/
4239   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4240 
4241   /* Construct the permutations for rows*/
4242   for (i=0,row = 0; i<nslim_row; ++i) {
4243     indx      = ridx[i];
4244     start_val = tns[indx];
4245     end_val   = tns[indx + 1];
4246     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4247   }
4248 
4249   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4250   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4251 
4252   /* Construct permutations for columns */
4253   for (i=0,col=0; i<nslim_col; ++i) {
4254     indx      = cidx[i];
4255     start_val = tns[indx];
4256     end_val   = tns[indx + 1];
4257     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4258   }
4259 
4260   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4261   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4262   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4263   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4264 
4265   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4266   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4267 
4268   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4269   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4270   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4271   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4272   ierr = PetscFree(tns);CHKERRQ(ierr);
4273   PetscFunctionReturn(0);
4274 }
4275 
4276 #undef __FUNCT__
4277 #define __FUNCT__ "MatInodeGetInodeSizes"
4278 /*@C
4279    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4280 
4281    Not Collective
4282 
4283    Input Parameter:
4284 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4285 
4286    Output Parameter:
4287 +  node_count - no of inodes present in the matrix.
4288 .  sizes      - an array of size node_count,with sizes of each inode.
4289 -  limit      - the max size used to generate the inodes.
4290 
4291    Level: advanced
4292 
4293    Notes: This routine returns some internal storage information
4294    of the matrix, it is intended to be used by advanced users.
4295    It should be called after the matrix is assembled.
4296    The contents of the sizes[] array should not be changed.
4297    NULL may be passed for information not requested.
4298 
4299 .keywords: matrix, seqaij, get, inode
4300 
4301 .seealso: MatGetInfo()
4302 @*/
4303 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4304 {
4305   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4306 
4307   PetscFunctionBegin;
4308   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4309   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4310   if (f) {
4311     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4312   }
4313   PetscFunctionReturn(0);
4314 }
4315 
4316 #undef __FUNCT__
4317 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4318 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4319 {
4320   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4321 
4322   PetscFunctionBegin;
4323   if (node_count) *node_count = a->inode.node_count;
4324   if (sizes)      *sizes      = a->inode.size;
4325   if (limit)      *limit      = a->inode.limit;
4326   PetscFunctionReturn(0);
4327 }
4328