xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 8758e1faf4825e4b427fda184376c8a4a04678d8)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1) {
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     col  = *j++ + ishift;
101     i2   = tvc[col];
102     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103       ia[i1+1]++;
104       ia[i2+1]++;
105       i2++;                     /* Start col of next node */
106       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107       i2 = tvc[col];
108     }
109     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
110   }
111 
112   /* shift ia[i] to point to next row */
113   for (i1=1; i1<nslim_row+1; i1++) {
114     row        = ia[i1-1];
115     ia[i1]    += row;
116     work[i1-1] = row - oshift;
117   }
118 
119   /* allocate space for column pointers */
120   nz   = ia[nslim_row] + (!ishift);
121   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
122   *jja = ja;
123 
124   /* loop over lower triangular part putting into ja */
125   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126     j    = aj + ai[row] + ishift;
127     jmax = aj + ai[row+1] + ishift;
128     col  = *j++ + ishift;
129     i2   = tvc[col];
130     while (i2<i1 && j<jmax) {
131       ja[work[i2]++] = i1 + oshift;
132       ja[work[i1]++] = i2 + oshift;
133       ++i2;
134       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135       i2 = tvc[col];
136     }
137     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
138 
139   }
140   ierr = PetscFree(work);CHKERRQ(ierr);
141   ierr = PetscFree(tns);CHKERRQ(ierr);
142   ierr = PetscFree(tvc);CHKERRQ(ierr);
143   PetscFunctionReturn(0);
144 }
145 
146 /*
147       This builds nonsymmetric version of nonzero structure,
148 */
149 #undef __FUNCT__
150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152 {
153   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
154   PetscErrorCode ierr;
155   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156   PetscInt       *tns,*tvc,nsz,i1,i2;
157   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
158 
159   PetscFunctionBegin;
160   nslim_row = a->inode.node_count;
161   n         = A->cmap->n;
162 
163   /* Create The column_inode for this matrix */
164   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
165 
166   /* allocate space for reformated column_inode structure */
167   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
168   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
169   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
170 
171   for (i1=0,col=0; i1<nslim_col; ++i1) {
172     nsz = ns_col[i1];
173     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174   }
175   /* allocate space for row pointers */
176   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
177   *iia = ia;
178   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
180 
181   /* determine the number of columns in each row */
182   ia[0] = oshift;
183   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184     j   = aj + ai[row] + ishift;
185     col = *j++ + ishift;
186     i2  = tvc[col];
187     nz  = ai[row+1] - ai[row];
188     while (nz-- > 0) {           /* off-diagonal elemets */
189       ia[i1+1]++;
190       i2++;                     /* Start col of next node */
191       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192       if (nz > 0) i2 = tvc[col];
193     }
194   }
195 
196   /* shift ia[i] to point to next row */
197   for (i1=1; i1<nslim_row+1; i1++) {
198     row        = ia[i1-1];
199     ia[i1]    += row;
200     work[i1-1] = row - oshift;
201   }
202 
203   /* allocate space for column pointers */
204   nz   = ia[nslim_row] + (!ishift);
205   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
206   *jja = ja;
207 
208   /* loop over matrix putting into ja */
209   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210     j   = aj + ai[row] + ishift;
211     col = *j++ + ishift;
212     i2  = tvc[col];
213     nz  = ai[row+1] - ai[row];
214     while (nz-- > 0) {
215       ja[work[i1]++] = i2 + oshift;
216       ++i2;
217       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218       if (nz > 0) i2 = tvc[col];
219     }
220   }
221   ierr = PetscFree(ns_col);CHKERRQ(ierr);
222   ierr = PetscFree(work);CHKERRQ(ierr);
223   ierr = PetscFree(tns);CHKERRQ(ierr);
224   ierr = PetscFree(tvc);CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
231 {
232   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
233   PetscErrorCode ierr;
234 
235   PetscFunctionBegin;
236   *n = a->inode.node_count;
237   if (!ia) PetscFunctionReturn(0);
238   if (!blockcompressed) {
239     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
240   } else if (symmetric) {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   } else {
243     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #undef __FUNCT__
249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
251 {
252   PetscErrorCode ierr;
253 
254   PetscFunctionBegin;
255   if (!ia) PetscFunctionReturn(0);
256 
257   if (!blockcompressed) {
258     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
259   } else {
260     ierr = PetscFree(*ia);CHKERRQ(ierr);
261     ierr = PetscFree(*ja);CHKERRQ(ierr);
262   }
263   PetscFunctionReturn(0);
264 }
265 
266 /* ----------------------------------------------------------- */
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271 {
272   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
273   PetscErrorCode ierr;
274   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276 
277   PetscFunctionBegin;
278   nslim_row = a->inode.node_count;
279   n         = A->cmap->n;
280 
281   /* Create The column_inode for this matrix */
282   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
283 
284   /* allocate space for reformated column_inode structure */
285   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
286   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
287   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
288 
289   for (i1=0,col=0; i1<nslim_col; ++i1) {
290     nsz = ns_col[i1];
291     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292   }
293   /* allocate space for column pointers */
294   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
295   *iia = ia;
296   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
297   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
298 
299   /* determine the number of columns in each row */
300   ia[0] = oshift;
301   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302     j   = aj + ai[row] + ishift;
303     col = *j++ + ishift;
304     i2  = tvc[col];
305     nz  = ai[row+1] - ai[row];
306     while (nz-- > 0) {           /* off-diagonal elemets */
307       /* ia[i1+1]++; */
308       ia[i2+1]++;
309       i2++;
310       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311       if (nz > 0) i2 = tvc[col];
312     }
313   }
314 
315   /* shift ia[i] to point to next col */
316   for (i1=1; i1<nslim_col+1; i1++) {
317     col        = ia[i1-1];
318     ia[i1]    += col;
319     work[i1-1] = col - oshift;
320   }
321 
322   /* allocate space for column pointers */
323   nz   = ia[nslim_col] + (!ishift);
324   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
325   *jja = ja;
326 
327   /* loop over matrix putting into ja */
328   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329     j   = aj + ai[row] + ishift;
330     col = *j++ + ishift;
331     i2  = tvc[col];
332     nz  = ai[row+1] - ai[row];
333     while (nz-- > 0) {
334       /* ja[work[i1]++] = i2 + oshift; */
335       ja[work[i2]++] = i1 + oshift;
336       i2++;
337       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338       if (nz > 0) i2 = tvc[col];
339     }
340   }
341   ierr = PetscFree(ns_col);CHKERRQ(ierr);
342   ierr = PetscFree(work);CHKERRQ(ierr);
343   ierr = PetscFree(tns);CHKERRQ(ierr);
344   ierr = PetscFree(tvc);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351 {
352   PetscErrorCode ierr;
353 
354   PetscFunctionBegin;
355   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
356   if (!ia) PetscFunctionReturn(0);
357 
358   if (!blockcompressed) {
359     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
360   } else if (symmetric) {
361     /* Since the indices are symmetric it does'nt matter */
362     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363   } else {
364     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
365   }
366   PetscFunctionReturn(0);
367 }
368 
369 #undef __FUNCT__
370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
372 {
373   PetscErrorCode ierr;
374 
375   PetscFunctionBegin;
376   if (!ia) PetscFunctionReturn(0);
377   if (!blockcompressed) {
378     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
379   } else {
380     ierr = PetscFree(*ia);CHKERRQ(ierr);
381     ierr = PetscFree(*ja);CHKERRQ(ierr);
382   }
383   PetscFunctionReturn(0);
384 }
385 
386 /* ----------------------------------------------------------- */
387 
388 #undef __FUNCT__
389 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391 {
392   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
393   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394   PetscScalar       *y;
395   const PetscScalar *x;
396   const MatScalar   *v1,*v2,*v3,*v4,*v5;
397   PetscErrorCode    ierr;
398   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399   const PetscInt    *idx,*ns,*ii;
400 
401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403 #endif
404 
405   PetscFunctionBegin;
406   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407   node_max = a->inode.node_count;
408   ns       = a->inode.size;     /* Node Size array */
409   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
411   idx      = a->j;
412   v1       = a->a;
413   ii       = a->i;
414 
415   for (i = 0,row = 0; i< node_max; ++i) {
416     nsz         = ns[i];
417     n           = ii[1] - ii[0];
418     nonzerorow += (n>0)*nsz;
419     ii         += nsz;
420     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
421     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
422     sz = n;                     /* No of non zeros in this row */
423                                 /* Switch on the size of Node */
424     switch (nsz) {               /* Each loop in 'case' is unrolled */
425     case 1:
426       sum1 = 0.;
427 
428       for (n = 0; n< sz-1; n+=2) {
429         i1    = idx[0];         /* The instructions are ordered to */
430         i2    = idx[1];         /* make the compiler's job easy */
431         idx  += 2;
432         tmp0  = x[i1];
433         tmp1  = x[i2];
434         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435       }
436 
437       if (n == sz-1) {          /* Take care of the last nonzero  */
438         tmp0  = x[*idx++];
439         sum1 += *v1++ *tmp0;
440       }
441       y[row++]=sum1;
442       break;
443     case 2:
444       sum1 = 0.;
445       sum2 = 0.;
446       v2   = v1 + n;
447 
448       for (n = 0; n< sz-1; n+=2) {
449         i1    = idx[0];
450         i2    = idx[1];
451         idx  += 2;
452         tmp0  = x[i1];
453         tmp1  = x[i2];
454         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456       }
457       if (n == sz-1) {
458         tmp0  = x[*idx++];
459         sum1 += *v1++ * tmp0;
460         sum2 += *v2++ * tmp0;
461       }
462       y[row++]=sum1;
463       y[row++]=sum2;
464       v1      =v2;              /* Since the next block to be processed starts there*/
465       idx    +=sz;
466       break;
467     case 3:
468       sum1 = 0.;
469       sum2 = 0.;
470       sum3 = 0.;
471       v2   = v1 + n;
472       v3   = v2 + n;
473 
474       for (n = 0; n< sz-1; n+=2) {
475         i1    = idx[0];
476         i2    = idx[1];
477         idx  += 2;
478         tmp0  = x[i1];
479         tmp1  = x[i2];
480         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483       }
484       if (n == sz-1) {
485         tmp0  = x[*idx++];
486         sum1 += *v1++ * tmp0;
487         sum2 += *v2++ * tmp0;
488         sum3 += *v3++ * tmp0;
489       }
490       y[row++]=sum1;
491       y[row++]=sum2;
492       y[row++]=sum3;
493       v1      =v3;              /* Since the next block to be processed starts there*/
494       idx    +=2*sz;
495       break;
496     case 4:
497       sum1 = 0.;
498       sum2 = 0.;
499       sum3 = 0.;
500       sum4 = 0.;
501       v2   = v1 + n;
502       v3   = v2 + n;
503       v4   = v3 + n;
504 
505       for (n = 0; n< sz-1; n+=2) {
506         i1    = idx[0];
507         i2    = idx[1];
508         idx  += 2;
509         tmp0  = x[i1];
510         tmp1  = x[i2];
511         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515       }
516       if (n == sz-1) {
517         tmp0  = x[*idx++];
518         sum1 += *v1++ * tmp0;
519         sum2 += *v2++ * tmp0;
520         sum3 += *v3++ * tmp0;
521         sum4 += *v4++ * tmp0;
522       }
523       y[row++]=sum1;
524       y[row++]=sum2;
525       y[row++]=sum3;
526       y[row++]=sum4;
527       v1      =v4;              /* Since the next block to be processed starts there*/
528       idx    +=3*sz;
529       break;
530     case 5:
531       sum1 = 0.;
532       sum2 = 0.;
533       sum3 = 0.;
534       sum4 = 0.;
535       sum5 = 0.;
536       v2   = v1 + n;
537       v3   = v2 + n;
538       v4   = v3 + n;
539       v5   = v4 + n;
540 
541       for (n = 0; n<sz-1; n+=2) {
542         i1    = idx[0];
543         i2    = idx[1];
544         idx  += 2;
545         tmp0  = x[i1];
546         tmp1  = x[i2];
547         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552       }
553       if (n == sz-1) {
554         tmp0  = x[*idx++];
555         sum1 += *v1++ * tmp0;
556         sum2 += *v2++ * tmp0;
557         sum3 += *v3++ * tmp0;
558         sum4 += *v4++ * tmp0;
559         sum5 += *v5++ * tmp0;
560       }
561       y[row++]=sum1;
562       y[row++]=sum2;
563       y[row++]=sum3;
564       y[row++]=sum4;
565       y[row++]=sum5;
566       v1      =v5;       /* Since the next block to be processed starts there */
567       idx    +=4*sz;
568       break;
569     default:
570       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571     }
572   }
573   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
574   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
575   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
576   PetscFunctionReturn(0);
577 }
578 /* ----------------------------------------------------------- */
579 /* Almost same code as the MatMult_SeqAIJ_Inode() */
580 #undef __FUNCT__
581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583 {
584   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
585   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586   const MatScalar   *v1,*v2,*v3,*v4,*v5;
587   const PetscScalar *x;
588   PetscScalar       *y,*z,*zt;
589   PetscErrorCode    ierr;
590   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
591   const PetscInt    *idx,*ns,*ii;
592 
593   PetscFunctionBegin;
594   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595   node_max = a->inode.node_count;
596   ns       = a->inode.size;     /* Node Size array */
597 
598   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
599   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
600   if (zz != yy) {
601     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
602   } else {
603     z = y;
604   }
605   zt = z;
606 
607   idx = a->j;
608   v1  = a->a;
609   ii  = a->i;
610 
611   for (i = 0,row = 0; i< node_max; ++i) {
612     nsz = ns[i];
613     n   = ii[1] - ii[0];
614     ii += nsz;
615     sz  = n;                    /* No of non zeros in this row */
616                                 /* Switch on the size of Node */
617     switch (nsz) {               /* Each loop in 'case' is unrolled */
618     case 1:
619       sum1 = *zt++;
620 
621       for (n = 0; n< sz-1; n+=2) {
622         i1    = idx[0];         /* The instructions are ordered to */
623         i2    = idx[1];         /* make the compiler's job easy */
624         idx  += 2;
625         tmp0  = x[i1];
626         tmp1  = x[i2];
627         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
628       }
629 
630       if (n   == sz-1) {          /* Take care of the last nonzero  */
631         tmp0  = x[*idx++];
632         sum1 += *v1++ * tmp0;
633       }
634       y[row++]=sum1;
635       break;
636     case 2:
637       sum1 = *zt++;
638       sum2 = *zt++;
639       v2   = v1 + n;
640 
641       for (n = 0; n< sz-1; n+=2) {
642         i1    = idx[0];
643         i2    = idx[1];
644         idx  += 2;
645         tmp0  = x[i1];
646         tmp1  = x[i2];
647         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
648         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
649       }
650       if (n   == sz-1) {
651         tmp0  = x[*idx++];
652         sum1 += *v1++ * tmp0;
653         sum2 += *v2++ * tmp0;
654       }
655       y[row++]=sum1;
656       y[row++]=sum2;
657       v1      =v2;              /* Since the next block to be processed starts there*/
658       idx    +=sz;
659       break;
660     case 3:
661       sum1 = *zt++;
662       sum2 = *zt++;
663       sum3 = *zt++;
664       v2   = v1 + n;
665       v3   = v2 + n;
666 
667       for (n = 0; n< sz-1; n+=2) {
668         i1    = idx[0];
669         i2    = idx[1];
670         idx  += 2;
671         tmp0  = x[i1];
672         tmp1  = x[i2];
673         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
674         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
675         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
676       }
677       if (n == sz-1) {
678         tmp0  = x[*idx++];
679         sum1 += *v1++ * tmp0;
680         sum2 += *v2++ * tmp0;
681         sum3 += *v3++ * tmp0;
682       }
683       y[row++]=sum1;
684       y[row++]=sum2;
685       y[row++]=sum3;
686       v1      =v3;              /* Since the next block to be processed starts there*/
687       idx    +=2*sz;
688       break;
689     case 4:
690       sum1 = *zt++;
691       sum2 = *zt++;
692       sum3 = *zt++;
693       sum4 = *zt++;
694       v2   = v1 + n;
695       v3   = v2 + n;
696       v4   = v3 + n;
697 
698       for (n = 0; n< sz-1; n+=2) {
699         i1    = idx[0];
700         i2    = idx[1];
701         idx  += 2;
702         tmp0  = x[i1];
703         tmp1  = x[i2];
704         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
705         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
706         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
707         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
708       }
709       if (n == sz-1) {
710         tmp0  = x[*idx++];
711         sum1 += *v1++ * tmp0;
712         sum2 += *v2++ * tmp0;
713         sum3 += *v3++ * tmp0;
714         sum4 += *v4++ * tmp0;
715       }
716       y[row++]=sum1;
717       y[row++]=sum2;
718       y[row++]=sum3;
719       y[row++]=sum4;
720       v1      =v4;              /* Since the next block to be processed starts there*/
721       idx    +=3*sz;
722       break;
723     case 5:
724       sum1 = *zt++;
725       sum2 = *zt++;
726       sum3 = *zt++;
727       sum4 = *zt++;
728       sum5 = *zt++;
729       v2   = v1 + n;
730       v3   = v2 + n;
731       v4   = v3 + n;
732       v5   = v4 + n;
733 
734       for (n = 0; n<sz-1; n+=2) {
735         i1    = idx[0];
736         i2    = idx[1];
737         idx  += 2;
738         tmp0  = x[i1];
739         tmp1  = x[i2];
740         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
741         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
742         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
743         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
744         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
745       }
746       if (n   == sz-1) {
747         tmp0  = x[*idx++];
748         sum1 += *v1++ * tmp0;
749         sum2 += *v2++ * tmp0;
750         sum3 += *v3++ * tmp0;
751         sum4 += *v4++ * tmp0;
752         sum5 += *v5++ * tmp0;
753       }
754       y[row++]=sum1;
755       y[row++]=sum2;
756       y[row++]=sum3;
757       y[row++]=sum4;
758       y[row++]=sum5;
759       v1      =v5;       /* Since the next block to be processed starts there */
760       idx    +=4*sz;
761       break;
762     default:
763       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
764     }
765   }
766   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
767   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
768   if (zz != yy) {
769     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
770   }
771   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
772   PetscFunctionReturn(0);
773 }
774 
775 /* ----------------------------------------------------------- */
776 #undef __FUNCT__
777 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
778 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
779 {
780   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
781   IS                iscol = a->col,isrow = a->row;
782   PetscErrorCode    ierr;
783   const PetscInt    *r,*c,*rout,*cout;
784   PetscInt          i,j,n = A->rmap->n,nz;
785   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
786   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
787   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
788   PetscScalar       sum1,sum2,sum3,sum4,sum5;
789   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790   const PetscScalar *b;
791 
792   PetscFunctionBegin;
793   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
794   node_max = a->inode.node_count;
795   ns       = a->inode.size;     /* Node Size array */
796 
797   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   tmp  = a->solve_work;
800 
801   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
802   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
803 
804   /* forward solve the lower triangular */
805   tmps = tmp;
806   aa   = a_a;
807   aj   = a_j;
808   ad   = a->diag;
809 
810   for (i = 0,row = 0; i< node_max; ++i) {
811     nsz = ns[i];
812     aii = ai[row];
813     v1  = aa + aii;
814     vi  = aj + aii;
815     nz  = ad[row]- aii;
816     if (i < node_max-1) {
817       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
818       * but our indexing to determine it's size could. */
819       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
820       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
821       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
822       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
823     }
824 
825     switch (nsz) {               /* Each loop in 'case' is unrolled */
826     case 1:
827       sum1 = b[*r++];
828       for (j=0; j<nz-1; j+=2) {
829         i0    = vi[0];
830         i1    = vi[1];
831         vi   +=2;
832         tmp0  = tmps[i0];
833         tmp1  = tmps[i1];
834         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
835       }
836       if (j == nz-1) {
837         tmp0  = tmps[*vi++];
838         sum1 -= *v1++ *tmp0;
839       }
840       tmp[row++]=sum1;
841       break;
842     case 2:
843       sum1 = b[*r++];
844       sum2 = b[*r++];
845       v2   = aa + ai[row+1];
846 
847       for (j=0; j<nz-1; j+=2) {
848         i0    = vi[0];
849         i1    = vi[1];
850         vi   +=2;
851         tmp0  = tmps[i0];
852         tmp1  = tmps[i1];
853         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
854         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
855       }
856       if (j == nz-1) {
857         tmp0  = tmps[*vi++];
858         sum1 -= *v1++ *tmp0;
859         sum2 -= *v2++ *tmp0;
860       }
861       sum2     -= *v2++ *sum1;
862       tmp[row++]=sum1;
863       tmp[row++]=sum2;
864       break;
865     case 3:
866       sum1 = b[*r++];
867       sum2 = b[*r++];
868       sum3 = b[*r++];
869       v2   = aa + ai[row+1];
870       v3   = aa + ai[row+2];
871 
872       for (j=0; j<nz-1; j+=2) {
873         i0    = vi[0];
874         i1    = vi[1];
875         vi   +=2;
876         tmp0  = tmps[i0];
877         tmp1  = tmps[i1];
878         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
879         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
880         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
881       }
882       if (j == nz-1) {
883         tmp0  = tmps[*vi++];
884         sum1 -= *v1++ *tmp0;
885         sum2 -= *v2++ *tmp0;
886         sum3 -= *v3++ *tmp0;
887       }
888       sum2 -= *v2++ * sum1;
889       sum3 -= *v3++ * sum1;
890       sum3 -= *v3++ * sum2;
891 
892       tmp[row++]=sum1;
893       tmp[row++]=sum2;
894       tmp[row++]=sum3;
895       break;
896 
897     case 4:
898       sum1 = b[*r++];
899       sum2 = b[*r++];
900       sum3 = b[*r++];
901       sum4 = b[*r++];
902       v2   = aa + ai[row+1];
903       v3   = aa + ai[row+2];
904       v4   = aa + ai[row+3];
905 
906       for (j=0; j<nz-1; j+=2) {
907         i0    = vi[0];
908         i1    = vi[1];
909         vi   +=2;
910         tmp0  = tmps[i0];
911         tmp1  = tmps[i1];
912         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
913         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
914         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
915         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
916       }
917       if (j == nz-1) {
918         tmp0  = tmps[*vi++];
919         sum1 -= *v1++ *tmp0;
920         sum2 -= *v2++ *tmp0;
921         sum3 -= *v3++ *tmp0;
922         sum4 -= *v4++ *tmp0;
923       }
924       sum2 -= *v2++ * sum1;
925       sum3 -= *v3++ * sum1;
926       sum4 -= *v4++ * sum1;
927       sum3 -= *v3++ * sum2;
928       sum4 -= *v4++ * sum2;
929       sum4 -= *v4++ * sum3;
930 
931       tmp[row++]=sum1;
932       tmp[row++]=sum2;
933       tmp[row++]=sum3;
934       tmp[row++]=sum4;
935       break;
936     case 5:
937       sum1 = b[*r++];
938       sum2 = b[*r++];
939       sum3 = b[*r++];
940       sum4 = b[*r++];
941       sum5 = b[*r++];
942       v2   = aa + ai[row+1];
943       v3   = aa + ai[row+2];
944       v4   = aa + ai[row+3];
945       v5   = aa + ai[row+4];
946 
947       for (j=0; j<nz-1; j+=2) {
948         i0    = vi[0];
949         i1    = vi[1];
950         vi   +=2;
951         tmp0  = tmps[i0];
952         tmp1  = tmps[i1];
953         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
954         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
955         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
956         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
957         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
958       }
959       if (j == nz-1) {
960         tmp0  = tmps[*vi++];
961         sum1 -= *v1++ *tmp0;
962         sum2 -= *v2++ *tmp0;
963         sum3 -= *v3++ *tmp0;
964         sum4 -= *v4++ *tmp0;
965         sum5 -= *v5++ *tmp0;
966       }
967 
968       sum2 -= *v2++ * sum1;
969       sum3 -= *v3++ * sum1;
970       sum4 -= *v4++ * sum1;
971       sum5 -= *v5++ * sum1;
972       sum3 -= *v3++ * sum2;
973       sum4 -= *v4++ * sum2;
974       sum5 -= *v5++ * sum2;
975       sum4 -= *v4++ * sum3;
976       sum5 -= *v5++ * sum3;
977       sum5 -= *v5++ * sum4;
978 
979       tmp[row++]=sum1;
980       tmp[row++]=sum2;
981       tmp[row++]=sum3;
982       tmp[row++]=sum4;
983       tmp[row++]=sum5;
984       break;
985     default:
986       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
987     }
988   }
989   /* backward solve the upper triangular */
990   for (i=node_max -1,row = n-1; i>=0; i--) {
991     nsz = ns[i];
992     aii = ai[row+1] -1;
993     v1  = aa + aii;
994     vi  = aj + aii;
995     nz  = aii- ad[row];
996     switch (nsz) {               /* Each loop in 'case' is unrolled */
997     case 1:
998       sum1 = tmp[row];
999 
1000       for (j=nz; j>1; j-=2) {
1001         vi   -=2;
1002         i0    = vi[2];
1003         i1    = vi[1];
1004         tmp0  = tmps[i0];
1005         tmp1  = tmps[i1];
1006         v1   -= 2;
1007         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008       }
1009       if (j==1) {
1010         tmp0  = tmps[*vi--];
1011         sum1 -= *v1-- * tmp0;
1012       }
1013       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1014       break;
1015     case 2:
1016       sum1 = tmp[row];
1017       sum2 = tmp[row -1];
1018       v2   = aa + ai[row]-1;
1019       for (j=nz; j>1; j-=2) {
1020         vi   -=2;
1021         i0    = vi[2];
1022         i1    = vi[1];
1023         tmp0  = tmps[i0];
1024         tmp1  = tmps[i1];
1025         v1   -= 2;
1026         v2   -= 2;
1027         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1028         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1029       }
1030       if (j==1) {
1031         tmp0  = tmps[*vi--];
1032         sum1 -= *v1-- * tmp0;
1033         sum2 -= *v2-- * tmp0;
1034       }
1035 
1036       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1037       sum2   -= *v2-- * tmp0;
1038       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1039       break;
1040     case 3:
1041       sum1 = tmp[row];
1042       sum2 = tmp[row -1];
1043       sum3 = tmp[row -2];
1044       v2   = aa + ai[row]-1;
1045       v3   = aa + ai[row -1]-1;
1046       for (j=nz; j>1; j-=2) {
1047         vi   -=2;
1048         i0    = vi[2];
1049         i1    = vi[1];
1050         tmp0  = tmps[i0];
1051         tmp1  = tmps[i1];
1052         v1   -= 2;
1053         v2   -= 2;
1054         v3   -= 2;
1055         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1056         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1057         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1058       }
1059       if (j==1) {
1060         tmp0  = tmps[*vi--];
1061         sum1 -= *v1-- * tmp0;
1062         sum2 -= *v2-- * tmp0;
1063         sum3 -= *v3-- * tmp0;
1064       }
1065       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1066       sum2   -= *v2-- * tmp0;
1067       sum3   -= *v3-- * tmp0;
1068       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1069       sum3   -= *v3-- * tmp0;
1070       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1071 
1072       break;
1073     case 4:
1074       sum1 = tmp[row];
1075       sum2 = tmp[row -1];
1076       sum3 = tmp[row -2];
1077       sum4 = tmp[row -3];
1078       v2   = aa + ai[row]-1;
1079       v3   = aa + ai[row -1]-1;
1080       v4   = aa + ai[row -2]-1;
1081 
1082       for (j=nz; j>1; j-=2) {
1083         vi   -=2;
1084         i0    = vi[2];
1085         i1    = vi[1];
1086         tmp0  = tmps[i0];
1087         tmp1  = tmps[i1];
1088         v1   -= 2;
1089         v2   -= 2;
1090         v3   -= 2;
1091         v4   -= 2;
1092         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1093         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1094         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1095         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1096       }
1097       if (j==1) {
1098         tmp0  = tmps[*vi--];
1099         sum1 -= *v1-- * tmp0;
1100         sum2 -= *v2-- * tmp0;
1101         sum3 -= *v3-- * tmp0;
1102         sum4 -= *v4-- * tmp0;
1103       }
1104 
1105       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1106       sum2   -= *v2-- * tmp0;
1107       sum3   -= *v3-- * tmp0;
1108       sum4   -= *v4-- * tmp0;
1109       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1110       sum3   -= *v3-- * tmp0;
1111       sum4   -= *v4-- * tmp0;
1112       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1113       sum4   -= *v4-- * tmp0;
1114       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1115       break;
1116     case 5:
1117       sum1 = tmp[row];
1118       sum2 = tmp[row -1];
1119       sum3 = tmp[row -2];
1120       sum4 = tmp[row -3];
1121       sum5 = tmp[row -4];
1122       v2   = aa + ai[row]-1;
1123       v3   = aa + ai[row -1]-1;
1124       v4   = aa + ai[row -2]-1;
1125       v5   = aa + ai[row -3]-1;
1126       for (j=nz; j>1; j-=2) {
1127         vi   -= 2;
1128         i0    = vi[2];
1129         i1    = vi[1];
1130         tmp0  = tmps[i0];
1131         tmp1  = tmps[i1];
1132         v1   -= 2;
1133         v2   -= 2;
1134         v3   -= 2;
1135         v4   -= 2;
1136         v5   -= 2;
1137         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142       }
1143       if (j==1) {
1144         tmp0  = tmps[*vi--];
1145         sum1 -= *v1-- * tmp0;
1146         sum2 -= *v2-- * tmp0;
1147         sum3 -= *v3-- * tmp0;
1148         sum4 -= *v4-- * tmp0;
1149         sum5 -= *v5-- * tmp0;
1150       }
1151 
1152       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1153       sum2   -= *v2-- * tmp0;
1154       sum3   -= *v3-- * tmp0;
1155       sum4   -= *v4-- * tmp0;
1156       sum5   -= *v5-- * tmp0;
1157       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1158       sum3   -= *v3-- * tmp0;
1159       sum4   -= *v4-- * tmp0;
1160       sum5   -= *v5-- * tmp0;
1161       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162       sum4   -= *v4-- * tmp0;
1163       sum5   -= *v5-- * tmp0;
1164       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165       sum5   -= *v5-- * tmp0;
1166       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1167       break;
1168     default:
1169       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1170     }
1171   }
1172   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1173   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1174   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1176   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 #undef __FUNCT__
1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1183 {
1184   Mat             C     =B;
1185   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1186   IS              isrow = b->row,isicol = b->icol;
1187   PetscErrorCode  ierr;
1188   const PetscInt  *r,*ic,*ics;
1189   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1190   PetscInt        i,j,k,nz,nzL,row,*pj;
1191   const PetscInt  *ajtmp,*bjtmp;
1192   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1193   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1194   FactorShiftCtx  sctx;
1195   const PetscInt  *ddiag;
1196   PetscReal       rs;
1197   MatScalar       d;
1198   PetscInt        inod,nodesz,node_max,col;
1199   const PetscInt  *ns;
1200   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1201 
1202   PetscFunctionBegin;
1203   /* MatPivotSetUp(): initialize shift context sctx */
1204   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1205 
1206   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207     ddiag          = a->diag;
1208     sctx.shift_top = info->zeropivot;
1209     for (i=0; i<n; i++) {
1210       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211       d  = (aa)[ddiag[i]];
1212       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213       v  = aa+ai[i];
1214       nz = ai[i+1] - ai[i];
1215       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1216       if (rs>sctx.shift_top) sctx.shift_top = rs;
1217     }
1218     sctx.shift_top *= 1.1;
1219     sctx.nshift_max = 5;
1220     sctx.shift_lo   = 0.;
1221     sctx.shift_hi   = 1.;
1222   }
1223 
1224   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1225   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1226 
1227   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1228   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1229   rtmp2 = rtmp1 + n;
1230   rtmp3 = rtmp2 + n;
1231   rtmp4 = rtmp3 + n;
1232   ics   = ic;
1233 
1234   node_max = a->inode.node_count;
1235   ns       = a->inode.size;
1236   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1237 
1238   /* If max inode size > 4, split it into two inodes.*/
1239   /* also map the inode sizes according to the ordering */
1240   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1241   for (i=0,j=0; i<node_max; ++i,++j) {
1242     if (ns[i] > 4) {
1243       tmp_vec1[j] = 4;
1244       ++j;
1245       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1246     } else {
1247       tmp_vec1[j] = ns[i];
1248     }
1249   }
1250   /* Use the correct node_max */
1251   node_max = j;
1252 
1253   /* Now reorder the inode info based on mat re-ordering info */
1254   /* First create a row -> inode_size_array_index map */
1255   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1256   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1257   for (i=0,row=0; i<node_max; i++) {
1258     nodesz = tmp_vec1[i];
1259     for (j=0; j<nodesz; j++,row++) {
1260       nsmap[row] = i;
1261     }
1262   }
1263   /* Using nsmap, create a reordered ns structure */
1264   for (i=0,j=0; i< node_max; i++) {
1265     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1266     tmp_vec2[i] = nodesz;
1267     j          += nodesz;
1268   }
1269   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1270   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1271 
1272   /* Now use the correct ns */
1273   ns = tmp_vec2;
1274 
1275   do {
1276     sctx.newshift = PETSC_FALSE;
1277     /* Now loop over each block-row, and do the factorization */
1278     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1279       nodesz = ns[inod];
1280 
1281       switch (nodesz) {
1282       case 1:
1283         /*----------*/
1284         /* zero rtmp1 */
1285         /* L part */
1286         nz    = bi[i+1] - bi[i];
1287         bjtmp = bj + bi[i];
1288         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1289 
1290         /* U part */
1291         nz    = bdiag[i]-bdiag[i+1];
1292         bjtmp = bj + bdiag[i+1]+1;
1293         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1294 
1295         /* load in initial (unfactored row) */
1296         nz    = ai[r[i]+1] - ai[r[i]];
1297         ajtmp = aj + ai[r[i]];
1298         v     = aa + ai[r[i]];
1299         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1300 
1301         /* ZeropivotApply() */
1302         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1303 
1304         /* elimination */
1305         bjtmp = bj + bi[i];
1306         row   = *bjtmp++;
1307         nzL   = bi[i+1] - bi[i];
1308         for (k=0; k < nzL; k++) {
1309           pc = rtmp1 + row;
1310           if (*pc != 0.0) {
1311             pv   = b->a + bdiag[row];
1312             mul1 = *pc * (*pv);
1313             *pc  = mul1;
1314             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1315             pv   = b->a + bdiag[row+1]+1;
1316             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1317             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1318             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1319           }
1320           row = *bjtmp++;
1321         }
1322 
1323         /* finished row so stick it into b->a */
1324         rs = 0.0;
1325         /* L part */
1326         pv = b->a + bi[i];
1327         pj = b->j + bi[i];
1328         nz = bi[i+1] - bi[i];
1329         for (j=0; j<nz; j++) {
1330           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1331         }
1332 
1333         /* U part */
1334         pv = b->a + bdiag[i+1]+1;
1335         pj = b->j + bdiag[i+1]+1;
1336         nz = bdiag[i] - bdiag[i+1]-1;
1337         for (j=0; j<nz; j++) {
1338           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1339         }
1340 
1341         /* Check zero pivot */
1342         sctx.rs = rs;
1343         sctx.pv = rtmp1[i];
1344         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1345         if (sctx.newshift) break;
1346 
1347         /* Mark diagonal and invert diagonal for simplier triangular solves */
1348         pv  = b->a + bdiag[i];
1349         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1350         break;
1351 
1352       case 2:
1353         /*----------*/
1354         /* zero rtmp1 and rtmp2 */
1355         /* L part */
1356         nz    = bi[i+1] - bi[i];
1357         bjtmp = bj + bi[i];
1358         for  (j=0; j<nz; j++) {
1359           col        = bjtmp[j];
1360           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1361         }
1362 
1363         /* U part */
1364         nz    = bdiag[i]-bdiag[i+1];
1365         bjtmp = bj + bdiag[i+1]+1;
1366         for  (j=0; j<nz; j++) {
1367           col        = bjtmp[j];
1368           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1369         }
1370 
1371         /* load in initial (unfactored row) */
1372         nz    = ai[r[i]+1] - ai[r[i]];
1373         ajtmp = aj + ai[r[i]];
1374         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1375         for (j=0; j<nz; j++) {
1376           col        = ics[ajtmp[j]];
1377           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1378         }
1379         /* ZeropivotApply(): shift the diagonal of the matrix  */
1380         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1381 
1382         /* elimination */
1383         bjtmp = bj + bi[i];
1384         row   = *bjtmp++; /* pivot row */
1385         nzL   = bi[i+1] - bi[i];
1386         for (k=0; k < nzL; k++) {
1387           pc1 = rtmp1 + row;
1388           pc2 = rtmp2 + row;
1389           if (*pc1 != 0.0 || *pc2 != 0.0) {
1390             pv   = b->a + bdiag[row];
1391             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1392             *pc1 = mul1;       *pc2 = mul2;
1393 
1394             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1395             pv = b->a + bdiag[row+1]+1;
1396             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1397             for (j=0; j<nz; j++) {
1398               col         = pj[j];
1399               rtmp1[col] -= mul1 * pv[j];
1400               rtmp2[col] -= mul2 * pv[j];
1401             }
1402             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1403           }
1404           row = *bjtmp++;
1405         }
1406 
1407         /* finished row i; check zero pivot, then stick row i into b->a */
1408         rs = 0.0;
1409         /* L part */
1410         pc1 = b->a + bi[i];
1411         pj  = b->j + bi[i];
1412         nz  = bi[i+1] - bi[i];
1413         for (j=0; j<nz; j++) {
1414           col    = pj[j];
1415           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1416         }
1417         /* U part */
1418         pc1 = b->a + bdiag[i+1]+1;
1419         pj  = b->j + bdiag[i+1]+1;
1420         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1421         for (j=0; j<nz; j++) {
1422           col    = pj[j];
1423           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1424         }
1425 
1426         sctx.rs = rs;
1427         sctx.pv = rtmp1[i];
1428         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1429         if (sctx.newshift) break;
1430         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1431         *pc1 = 1.0/sctx.pv;
1432 
1433         /* Now take care of diagonal 2x2 block. */
1434         pc2 = rtmp2 + i;
1435         if (*pc2 != 0.0) {
1436           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1437           *pc2 = mul1;          /* insert L entry */
1438           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1439           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1440           for (j=0; j<nz; j++) {
1441             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1442           }
1443           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1444         }
1445 
1446         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1447         rs = 0.0;
1448         /* L part */
1449         pc2 = b->a + bi[i+1];
1450         pj  = b->j + bi[i+1];
1451         nz  = bi[i+2] - bi[i+1];
1452         for (j=0; j<nz; j++) {
1453           col    = pj[j];
1454           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1455         }
1456         /* U part */
1457         pc2 = b->a + bdiag[i+2]+1;
1458         pj  = b->j + bdiag[i+2]+1;
1459         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1460         for (j=0; j<nz; j++) {
1461           col    = pj[j];
1462           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1463         }
1464 
1465         sctx.rs = rs;
1466         sctx.pv = rtmp2[i+1];
1467         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1468         if (sctx.newshift) break;
1469         pc2  = b->a + bdiag[i+1];
1470         *pc2 = 1.0/sctx.pv;
1471         break;
1472 
1473       case 3:
1474         /*----------*/
1475         /* zero rtmp */
1476         /* L part */
1477         nz    = bi[i+1] - bi[i];
1478         bjtmp = bj + bi[i];
1479         for  (j=0; j<nz; j++) {
1480           col        = bjtmp[j];
1481           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1482         }
1483 
1484         /* U part */
1485         nz    = bdiag[i]-bdiag[i+1];
1486         bjtmp = bj + bdiag[i+1]+1;
1487         for  (j=0; j<nz; j++) {
1488           col        = bjtmp[j];
1489           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1490         }
1491 
1492         /* load in initial (unfactored row) */
1493         nz    = ai[r[i]+1] - ai[r[i]];
1494         ajtmp = aj + ai[r[i]];
1495         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1496         for (j=0; j<nz; j++) {
1497           col        = ics[ajtmp[j]];
1498           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1499         }
1500         /* ZeropivotApply(): shift the diagonal of the matrix  */
1501         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1502 
1503         /* elimination */
1504         bjtmp = bj + bi[i];
1505         row   = *bjtmp++; /* pivot row */
1506         nzL   = bi[i+1] - bi[i];
1507         for (k=0; k < nzL; k++) {
1508           pc1 = rtmp1 + row;
1509           pc2 = rtmp2 + row;
1510           pc3 = rtmp3 + row;
1511           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1512             pv   = b->a + bdiag[row];
1513             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1514             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1515 
1516             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1517             pv = b->a + bdiag[row+1]+1;
1518             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1519             for (j=0; j<nz; j++) {
1520               col         = pj[j];
1521               rtmp1[col] -= mul1 * pv[j];
1522               rtmp2[col] -= mul2 * pv[j];
1523               rtmp3[col] -= mul3 * pv[j];
1524             }
1525             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1526           }
1527           row = *bjtmp++;
1528         }
1529 
1530         /* finished row i; check zero pivot, then stick row i into b->a */
1531         rs = 0.0;
1532         /* L part */
1533         pc1 = b->a + bi[i];
1534         pj  = b->j + bi[i];
1535         nz  = bi[i+1] - bi[i];
1536         for (j=0; j<nz; j++) {
1537           col    = pj[j];
1538           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1539         }
1540         /* U part */
1541         pc1 = b->a + bdiag[i+1]+1;
1542         pj  = b->j + bdiag[i+1]+1;
1543         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1544         for (j=0; j<nz; j++) {
1545           col    = pj[j];
1546           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1547         }
1548 
1549         sctx.rs = rs;
1550         sctx.pv = rtmp1[i];
1551         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1552         if (sctx.newshift) break;
1553         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1554         *pc1 = 1.0/sctx.pv;
1555 
1556         /* Now take care of 1st column of diagonal 3x3 block. */
1557         pc2 = rtmp2 + i;
1558         pc3 = rtmp3 + i;
1559         if (*pc2 != 0.0 || *pc3 != 0.0) {
1560           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1561           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1562           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1563           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1564           for (j=0; j<nz; j++) {
1565             col         = pj[j];
1566             rtmp2[col] -= mul2 * rtmp1[col];
1567             rtmp3[col] -= mul3 * rtmp1[col];
1568           }
1569           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1570         }
1571 
1572         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1573         rs = 0.0;
1574         /* L part */
1575         pc2 = b->a + bi[i+1];
1576         pj  = b->j + bi[i+1];
1577         nz  = bi[i+2] - bi[i+1];
1578         for (j=0; j<nz; j++) {
1579           col    = pj[j];
1580           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1581         }
1582         /* U part */
1583         pc2 = b->a + bdiag[i+2]+1;
1584         pj  = b->j + bdiag[i+2]+1;
1585         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1586         for (j=0; j<nz; j++) {
1587           col    = pj[j];
1588           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1589         }
1590 
1591         sctx.rs = rs;
1592         sctx.pv = rtmp2[i+1];
1593         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1594         if (sctx.newshift) break;
1595         pc2  = b->a + bdiag[i+1];
1596         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1597 
1598         /* Now take care of 2nd column of diagonal 3x3 block. */
1599         pc3 = rtmp3 + i+1;
1600         if (*pc3 != 0.0) {
1601           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1602           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1603           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1604           for (j=0; j<nz; j++) {
1605             col         = pj[j];
1606             rtmp3[col] -= mul3 * rtmp2[col];
1607           }
1608           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1609         }
1610 
1611         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1612         rs = 0.0;
1613         /* L part */
1614         pc3 = b->a + bi[i+2];
1615         pj  = b->j + bi[i+2];
1616         nz  = bi[i+3] - bi[i+2];
1617         for (j=0; j<nz; j++) {
1618           col    = pj[j];
1619           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1620         }
1621         /* U part */
1622         pc3 = b->a + bdiag[i+3]+1;
1623         pj  = b->j + bdiag[i+3]+1;
1624         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1625         for (j=0; j<nz; j++) {
1626           col    = pj[j];
1627           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1628         }
1629 
1630         sctx.rs = rs;
1631         sctx.pv = rtmp3[i+2];
1632         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1633         if (sctx.newshift) break;
1634         pc3  = b->a + bdiag[i+2];
1635         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1636         break;
1637       case 4:
1638         /*----------*/
1639         /* zero rtmp */
1640         /* L part */
1641         nz    = bi[i+1] - bi[i];
1642         bjtmp = bj + bi[i];
1643         for  (j=0; j<nz; j++) {
1644           col        = bjtmp[j];
1645           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1646         }
1647 
1648         /* U part */
1649         nz    = bdiag[i]-bdiag[i+1];
1650         bjtmp = bj + bdiag[i+1]+1;
1651         for  (j=0; j<nz; j++) {
1652           col        = bjtmp[j];
1653           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1654         }
1655 
1656         /* load in initial (unfactored row) */
1657         nz    = ai[r[i]+1] - ai[r[i]];
1658         ajtmp = aj + ai[r[i]];
1659         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1660         for (j=0; j<nz; j++) {
1661           col        = ics[ajtmp[j]];
1662           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1663         }
1664         /* ZeropivotApply(): shift the diagonal of the matrix  */
1665         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1666 
1667         /* elimination */
1668         bjtmp = bj + bi[i];
1669         row   = *bjtmp++; /* pivot row */
1670         nzL   = bi[i+1] - bi[i];
1671         for (k=0; k < nzL; k++) {
1672           pc1 = rtmp1 + row;
1673           pc2 = rtmp2 + row;
1674           pc3 = rtmp3 + row;
1675           pc4 = rtmp4 + row;
1676           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1677             pv   = b->a + bdiag[row];
1678             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1679             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1680 
1681             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1682             pv = b->a + bdiag[row+1]+1;
1683             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1684             for (j=0; j<nz; j++) {
1685               col         = pj[j];
1686               rtmp1[col] -= mul1 * pv[j];
1687               rtmp2[col] -= mul2 * pv[j];
1688               rtmp3[col] -= mul3 * pv[j];
1689               rtmp4[col] -= mul4 * pv[j];
1690             }
1691             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1692           }
1693           row = *bjtmp++;
1694         }
1695 
1696         /* finished row i; check zero pivot, then stick row i into b->a */
1697         rs = 0.0;
1698         /* L part */
1699         pc1 = b->a + bi[i];
1700         pj  = b->j + bi[i];
1701         nz  = bi[i+1] - bi[i];
1702         for (j=0; j<nz; j++) {
1703           col    = pj[j];
1704           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1705         }
1706         /* U part */
1707         pc1 = b->a + bdiag[i+1]+1;
1708         pj  = b->j + bdiag[i+1]+1;
1709         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1710         for (j=0; j<nz; j++) {
1711           col    = pj[j];
1712           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1713         }
1714 
1715         sctx.rs = rs;
1716         sctx.pv = rtmp1[i];
1717         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1718         if (sctx.newshift) break;
1719         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1720         *pc1 = 1.0/sctx.pv;
1721 
1722         /* Now take care of 1st column of diagonal 4x4 block. */
1723         pc2 = rtmp2 + i;
1724         pc3 = rtmp3 + i;
1725         pc4 = rtmp4 + i;
1726         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1727           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1728           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1729           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1730           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1731           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1732           for (j=0; j<nz; j++) {
1733             col         = pj[j];
1734             rtmp2[col] -= mul2 * rtmp1[col];
1735             rtmp3[col] -= mul3 * rtmp1[col];
1736             rtmp4[col] -= mul4 * rtmp1[col];
1737           }
1738           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1739         }
1740 
1741         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1742         rs = 0.0;
1743         /* L part */
1744         pc2 = b->a + bi[i+1];
1745         pj  = b->j + bi[i+1];
1746         nz  = bi[i+2] - bi[i+1];
1747         for (j=0; j<nz; j++) {
1748           col    = pj[j];
1749           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1750         }
1751         /* U part */
1752         pc2 = b->a + bdiag[i+2]+1;
1753         pj  = b->j + bdiag[i+2]+1;
1754         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1755         for (j=0; j<nz; j++) {
1756           col    = pj[j];
1757           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1758         }
1759 
1760         sctx.rs = rs;
1761         sctx.pv = rtmp2[i+1];
1762         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1763         if (sctx.newshift) break;
1764         pc2  = b->a + bdiag[i+1];
1765         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1766 
1767         /* Now take care of 2nd column of diagonal 4x4 block. */
1768         pc3 = rtmp3 + i+1;
1769         pc4 = rtmp4 + i+1;
1770         if (*pc3 != 0.0 || *pc4 != 0.0) {
1771           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1772           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1773           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1774           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1775           for (j=0; j<nz; j++) {
1776             col         = pj[j];
1777             rtmp3[col] -= mul3 * rtmp2[col];
1778             rtmp4[col] -= mul4 * rtmp2[col];
1779           }
1780           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1781         }
1782 
1783         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1784         rs = 0.0;
1785         /* L part */
1786         pc3 = b->a + bi[i+2];
1787         pj  = b->j + bi[i+2];
1788         nz  = bi[i+3] - bi[i+2];
1789         for (j=0; j<nz; j++) {
1790           col    = pj[j];
1791           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1792         }
1793         /* U part */
1794         pc3 = b->a + bdiag[i+3]+1;
1795         pj  = b->j + bdiag[i+3]+1;
1796         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1797         for (j=0; j<nz; j++) {
1798           col    = pj[j];
1799           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1800         }
1801 
1802         sctx.rs = rs;
1803         sctx.pv = rtmp3[i+2];
1804         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1805         if (sctx.newshift) break;
1806         pc3  = b->a + bdiag[i+2];
1807         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1808 
1809         /* Now take care of 3rd column of diagonal 4x4 block. */
1810         pc4 = rtmp4 + i+2;
1811         if (*pc4 != 0.0) {
1812           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1813           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1814           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1815           for (j=0; j<nz; j++) {
1816             col         = pj[j];
1817             rtmp4[col] -= mul4 * rtmp3[col];
1818           }
1819           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1820         }
1821 
1822         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1823         rs = 0.0;
1824         /* L part */
1825         pc4 = b->a + bi[i+3];
1826         pj  = b->j + bi[i+3];
1827         nz  = bi[i+4] - bi[i+3];
1828         for (j=0; j<nz; j++) {
1829           col    = pj[j];
1830           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1831         }
1832         /* U part */
1833         pc4 = b->a + bdiag[i+4]+1;
1834         pj  = b->j + bdiag[i+4]+1;
1835         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1836         for (j=0; j<nz; j++) {
1837           col    = pj[j];
1838           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1839         }
1840 
1841         sctx.rs = rs;
1842         sctx.pv = rtmp4[i+3];
1843         ierr    = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr);
1844         if (sctx.newshift) break;
1845         pc4  = b->a + bdiag[i+3];
1846         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1847         break;
1848 
1849       default:
1850         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1851       }
1852       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1853       i += nodesz;                 /* Update the row */
1854     }
1855 
1856     /* MatPivotRefine() */
1857     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1858       /*
1859        * if no shift in this attempt & shifting & started shifting & can refine,
1860        * then try lower shift
1861        */
1862       sctx.shift_hi       = sctx.shift_fraction;
1863       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1864       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1865       sctx.newshift       = PETSC_TRUE;
1866       sctx.nshift++;
1867     }
1868   } while (sctx.newshift);
1869 
1870   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1871   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1872   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1873   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1874 
1875   C->ops->solve             = MatSolve_SeqAIJ;
1876   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1877   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1878   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1879   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1880   C->assembled              = PETSC_TRUE;
1881   C->preallocated           = PETSC_TRUE;
1882 
1883   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1884 
1885   /* MatShiftView(A,info,&sctx) */
1886   if (sctx.nshift) {
1887     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1888       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1889     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1890       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1891     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1892       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1893     }
1894   }
1895   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1896   PetscFunctionReturn(0);
1897 }
1898 
1899 #undef __FUNCT__
1900 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1901 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1902 {
1903   Mat             C     = B;
1904   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1905   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1906   PetscErrorCode  ierr;
1907   const PetscInt  *r,*ic,*c,*ics;
1908   PetscInt        n   = A->rmap->n,*bi = b->i;
1909   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1910   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1911   PetscInt        *ai = a->i,*aj = a->j;
1912   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1913   PetscScalar     mul1,mul2,mul3,tmp;
1914   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1915   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1916   PetscReal       rs=0.0;
1917   FactorShiftCtx  sctx;
1918 
1919   PetscFunctionBegin;
1920   sctx.shift_top      = 0;
1921   sctx.nshift_max     = 0;
1922   sctx.shift_lo       = 0;
1923   sctx.shift_hi       = 0;
1924   sctx.shift_fraction = 0;
1925 
1926   /* if both shift schemes are chosen by user, only use info->shiftpd */
1927   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1928     sctx.shift_top = 0;
1929     for (i=0; i<n; i++) {
1930       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1931       rs    = 0.0;
1932       ajtmp = aj + ai[i];
1933       rtmp1 = aa + ai[i];
1934       nz    = ai[i+1] - ai[i];
1935       for (j=0; j<nz; j++) {
1936         if (*ajtmp != i) {
1937           rs += PetscAbsScalar(*rtmp1++);
1938         } else {
1939           rs -= PetscRealPart(*rtmp1++);
1940         }
1941         ajtmp++;
1942       }
1943       if (rs>sctx.shift_top) sctx.shift_top = rs;
1944     }
1945     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1946     sctx.shift_top *= 1.1;
1947     sctx.nshift_max = 5;
1948     sctx.shift_lo   = 0.;
1949     sctx.shift_hi   = 1.;
1950   }
1951   sctx.shift_amount = 0;
1952   sctx.nshift       = 0;
1953 
1954   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1955   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1956   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1957   ierr   = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1958   ierr   = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1959   ics    = ic;
1960   rtmp22 = rtmp11 + n;
1961   rtmp33 = rtmp22 + n;
1962 
1963   node_max = a->inode.node_count;
1964   ns       = a->inode.size;
1965   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1966 
1967   /* If max inode size > 3, split it into two inodes.*/
1968   /* also map the inode sizes according to the ordering */
1969   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1970   for (i=0,j=0; i<node_max; ++i,++j) {
1971     if (ns[i]>3) {
1972       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1973       ++j;
1974       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1975     } else {
1976       tmp_vec1[j] = ns[i];
1977     }
1978   }
1979   /* Use the correct node_max */
1980   node_max = j;
1981 
1982   /* Now reorder the inode info based on mat re-ordering info */
1983   /* First create a row -> inode_size_array_index map */
1984   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1985   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1986   for (i=0,row=0; i<node_max; i++) {
1987     nodesz = tmp_vec1[i];
1988     for (j=0; j<nodesz; j++,row++) {
1989       nsmap[row] = i;
1990     }
1991   }
1992   /* Using nsmap, create a reordered ns structure */
1993   for (i=0,j=0; i< node_max; i++) {
1994     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1995     tmp_vec2[i] = nodesz;
1996     j          += nodesz;
1997   }
1998   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1999   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
2000   /* Now use the correct ns */
2001   ns = tmp_vec2;
2002 
2003   do {
2004     sctx.newshift = PETSC_FALSE;
2005     /* Now loop over each block-row, and do the factorization */
2006     for (i=0,row=0; i<node_max; i++) {
2007       nodesz = ns[i];
2008       nz     = bi[row+1] - bi[row];
2009       bjtmp  = bj + bi[row];
2010 
2011       switch (nodesz) {
2012       case 1:
2013         for  (j=0; j<nz; j++) {
2014           idx         = bjtmp[j];
2015           rtmp11[idx] = 0.0;
2016         }
2017 
2018         /* load in initial (unfactored row) */
2019         idx    = r[row];
2020         nz_tmp = ai[idx+1] - ai[idx];
2021         ajtmp  = aj + ai[idx];
2022         v1     = aa + ai[idx];
2023 
2024         for (j=0; j<nz_tmp; j++) {
2025           idx         = ics[ajtmp[j]];
2026           rtmp11[idx] = v1[j];
2027         }
2028         rtmp11[ics[r[row]]] += sctx.shift_amount;
2029 
2030         prow = *bjtmp++;
2031         while (prow < row) {
2032           pc1 = rtmp11 + prow;
2033           if (*pc1 != 0.0) {
2034             pv     = ba + bd[prow];
2035             pj     = nbj + bd[prow];
2036             mul1   = *pc1 * *pv++;
2037             *pc1   = mul1;
2038             nz_tmp = bi[prow+1] - bd[prow] - 1;
2039             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2040             for (j=0; j<nz_tmp; j++) {
2041               tmp          = pv[j];
2042               idx          = pj[j];
2043               rtmp11[idx] -= mul1 * tmp;
2044             }
2045           }
2046           prow = *bjtmp++;
2047         }
2048         pj  = bj + bi[row];
2049         pc1 = ba + bi[row];
2050 
2051         sctx.pv     = rtmp11[row];
2052         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2053         rs          = 0.0;
2054         for (j=0; j<nz; j++) {
2055           idx    = pj[j];
2056           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2057           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2058         }
2059         sctx.rs = rs;
2060         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2061         if (sctx.newshift) goto endofwhile;
2062         break;
2063 
2064       case 2:
2065         for (j=0; j<nz; j++) {
2066           idx         = bjtmp[j];
2067           rtmp11[idx] = 0.0;
2068           rtmp22[idx] = 0.0;
2069         }
2070 
2071         /* load in initial (unfactored row) */
2072         idx    = r[row];
2073         nz_tmp = ai[idx+1] - ai[idx];
2074         ajtmp  = aj + ai[idx];
2075         v1     = aa + ai[idx];
2076         v2     = aa + ai[idx+1];
2077         for (j=0; j<nz_tmp; j++) {
2078           idx         = ics[ajtmp[j]];
2079           rtmp11[idx] = v1[j];
2080           rtmp22[idx] = v2[j];
2081         }
2082         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2083         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2084 
2085         prow = *bjtmp++;
2086         while (prow < row) {
2087           pc1 = rtmp11 + prow;
2088           pc2 = rtmp22 + prow;
2089           if (*pc1 != 0.0 || *pc2 != 0.0) {
2090             pv   = ba + bd[prow];
2091             pj   = nbj + bd[prow];
2092             mul1 = *pc1 * *pv;
2093             mul2 = *pc2 * *pv;
2094             ++pv;
2095             *pc1 = mul1;
2096             *pc2 = mul2;
2097 
2098             nz_tmp = bi[prow+1] - bd[prow] - 1;
2099             for (j=0; j<nz_tmp; j++) {
2100               tmp          = pv[j];
2101               idx          = pj[j];
2102               rtmp11[idx] -= mul1 * tmp;
2103               rtmp22[idx] -= mul2 * tmp;
2104             }
2105             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2106           }
2107           prow = *bjtmp++;
2108         }
2109 
2110         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2111         pc1 = rtmp11 + prow;
2112         pc2 = rtmp22 + prow;
2113 
2114         sctx.pv = *pc1;
2115         pj      = bj + bi[prow];
2116         rs      = 0.0;
2117         for (j=0; j<nz; j++) {
2118           idx = pj[j];
2119           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2120         }
2121         sctx.rs = rs;
2122         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2123         if (sctx.newshift) goto endofwhile;
2124 
2125         if (*pc2 != 0.0) {
2126           pj     = nbj + bd[prow];
2127           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2128           *pc2   = mul2;
2129           nz_tmp = bi[prow+1] - bd[prow] - 1;
2130           for (j=0; j<nz_tmp; j++) {
2131             idx          = pj[j];
2132             tmp          = rtmp11[idx];
2133             rtmp22[idx] -= mul2 * tmp;
2134           }
2135           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2136         }
2137 
2138         pj  = bj + bi[row];
2139         pc1 = ba + bi[row];
2140         pc2 = ba + bi[row+1];
2141 
2142         sctx.pv       = rtmp22[row+1];
2143         rs            = 0.0;
2144         rtmp11[row]   = 1.0/rtmp11[row];
2145         rtmp22[row+1] = 1.0/rtmp22[row+1];
2146         /* copy row entries from dense representation to sparse */
2147         for (j=0; j<nz; j++) {
2148           idx    = pj[j];
2149           pc1[j] = rtmp11[idx];
2150           pc2[j] = rtmp22[idx];
2151           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2152         }
2153         sctx.rs = rs;
2154         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2155         if (sctx.newshift) goto endofwhile;
2156         break;
2157 
2158       case 3:
2159         for  (j=0; j<nz; j++) {
2160           idx         = bjtmp[j];
2161           rtmp11[idx] = 0.0;
2162           rtmp22[idx] = 0.0;
2163           rtmp33[idx] = 0.0;
2164         }
2165         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2166         idx    = r[row];
2167         nz_tmp = ai[idx+1] - ai[idx];
2168         ajtmp  = aj + ai[idx];
2169         v1     = aa + ai[idx];
2170         v2     = aa + ai[idx+1];
2171         v3     = aa + ai[idx+2];
2172         for (j=0; j<nz_tmp; j++) {
2173           idx         = ics[ajtmp[j]];
2174           rtmp11[idx] = v1[j];
2175           rtmp22[idx] = v2[j];
2176           rtmp33[idx] = v3[j];
2177         }
2178         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2179         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2180         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2181 
2182         /* loop over all pivot row blocks above this row block */
2183         prow = *bjtmp++;
2184         while (prow < row) {
2185           pc1 = rtmp11 + prow;
2186           pc2 = rtmp22 + prow;
2187           pc3 = rtmp33 + prow;
2188           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2189             pv   = ba  + bd[prow];
2190             pj   = nbj + bd[prow];
2191             mul1 = *pc1 * *pv;
2192             mul2 = *pc2 * *pv;
2193             mul3 = *pc3 * *pv;
2194             ++pv;
2195             *pc1 = mul1;
2196             *pc2 = mul2;
2197             *pc3 = mul3;
2198 
2199             nz_tmp = bi[prow+1] - bd[prow] - 1;
2200             /* update this row based on pivot row */
2201             for (j=0; j<nz_tmp; j++) {
2202               tmp          = pv[j];
2203               idx          = pj[j];
2204               rtmp11[idx] -= mul1 * tmp;
2205               rtmp22[idx] -= mul2 * tmp;
2206               rtmp33[idx] -= mul3 * tmp;
2207             }
2208             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2209           }
2210           prow = *bjtmp++;
2211         }
2212 
2213         /* Now take care of diagonal 3x3 block in this set of rows */
2214         /* note: prow = row here */
2215         pc1 = rtmp11 + prow;
2216         pc2 = rtmp22 + prow;
2217         pc3 = rtmp33 + prow;
2218 
2219         sctx.pv = *pc1;
2220         pj      = bj + bi[prow];
2221         rs      = 0.0;
2222         for (j=0; j<nz; j++) {
2223           idx = pj[j];
2224           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2225         }
2226         sctx.rs = rs;
2227         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2228         if (sctx.newshift) goto endofwhile;
2229 
2230         if (*pc2 != 0.0 || *pc3 != 0.0) {
2231           mul2   = (*pc2)/(*pc1);
2232           mul3   = (*pc3)/(*pc1);
2233           *pc2   = mul2;
2234           *pc3   = mul3;
2235           nz_tmp = bi[prow+1] - bd[prow] - 1;
2236           pj     = nbj + bd[prow];
2237           for (j=0; j<nz_tmp; j++) {
2238             idx          = pj[j];
2239             tmp          = rtmp11[idx];
2240             rtmp22[idx] -= mul2 * tmp;
2241             rtmp33[idx] -= mul3 * tmp;
2242           }
2243           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2244         }
2245         ++prow;
2246 
2247         pc2     = rtmp22 + prow;
2248         pc3     = rtmp33 + prow;
2249         sctx.pv = *pc2;
2250         pj      = bj + bi[prow];
2251         rs      = 0.0;
2252         for (j=0; j<nz; j++) {
2253           idx = pj[j];
2254           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2255         }
2256         sctx.rs = rs;
2257         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2258         if (sctx.newshift) goto endofwhile;
2259 
2260         if (*pc3 != 0.0) {
2261           mul3   = (*pc3)/(*pc2);
2262           *pc3   = mul3;
2263           pj     = nbj + bd[prow];
2264           nz_tmp = bi[prow+1] - bd[prow] - 1;
2265           for (j=0; j<nz_tmp; j++) {
2266             idx          = pj[j];
2267             tmp          = rtmp22[idx];
2268             rtmp33[idx] -= mul3 * tmp;
2269           }
2270           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2271         }
2272 
2273         pj  = bj + bi[row];
2274         pc1 = ba + bi[row];
2275         pc2 = ba + bi[row+1];
2276         pc3 = ba + bi[row+2];
2277 
2278         sctx.pv       = rtmp33[row+2];
2279         rs            = 0.0;
2280         rtmp11[row]   = 1.0/rtmp11[row];
2281         rtmp22[row+1] = 1.0/rtmp22[row+1];
2282         rtmp33[row+2] = 1.0/rtmp33[row+2];
2283         /* copy row entries from dense representation to sparse */
2284         for (j=0; j<nz; j++) {
2285           idx    = pj[j];
2286           pc1[j] = rtmp11[idx];
2287           pc2[j] = rtmp22[idx];
2288           pc3[j] = rtmp33[idx];
2289           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2290         }
2291 
2292         sctx.rs = rs;
2293         ierr    = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr);
2294         if (sctx.newshift) goto endofwhile;
2295         break;
2296 
2297       default:
2298         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2299       }
2300       row += nodesz;                 /* Update the row */
2301     }
2302 endofwhile:;
2303   } while (sctx.newshift);
2304   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2305   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2306   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2307   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2308   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2309 
2310   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2311   /* do not set solve add, since MatSolve_Inode + Add is faster */
2312   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2313   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2314   C->assembled              = PETSC_TRUE;
2315   C->preallocated           = PETSC_TRUE;
2316   if (sctx.nshift) {
2317     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2318       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2319     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2320       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2321     }
2322   }
2323   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2324   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2325   PetscFunctionReturn(0);
2326 }
2327 
2328 
2329 /* ----------------------------------------------------------- */
2330 #undef __FUNCT__
2331 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2332 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2333 {
2334   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2335   IS                iscol = a->col,isrow = a->row;
2336   PetscErrorCode    ierr;
2337   const PetscInt    *r,*c,*rout,*cout;
2338   PetscInt          i,j,n = A->rmap->n;
2339   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2340   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2341   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2342   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2343   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2344   const PetscScalar *b;
2345 
2346   PetscFunctionBegin;
2347   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2348   node_max = a->inode.node_count;
2349   ns       = a->inode.size;     /* Node Size array */
2350 
2351   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2352   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2353   tmp  = a->solve_work;
2354 
2355   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2356   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2357 
2358   /* forward solve the lower triangular */
2359   tmps = tmp;
2360   aa   = a_a;
2361   aj   = a_j;
2362   ad   = a->diag;
2363 
2364   for (i = 0,row = 0; i< node_max; ++i) {
2365     nsz = ns[i];
2366     aii = ai[row];
2367     v1  = aa + aii;
2368     vi  = aj + aii;
2369     nz  = ai[row+1]- ai[row];
2370 
2371     if (i < node_max-1) {
2372       /* Prefetch the indices for the next block */
2373       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2374       /* Prefetch the data for the next block */
2375       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2376     }
2377 
2378     switch (nsz) {               /* Each loop in 'case' is unrolled */
2379     case 1:
2380       sum1 = b[r[row]];
2381       for (j=0; j<nz-1; j+=2) {
2382         i0    = vi[j];
2383         i1    = vi[j+1];
2384         tmp0  = tmps[i0];
2385         tmp1  = tmps[i1];
2386         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2387       }
2388       if (j == nz-1) {
2389         tmp0  = tmps[vi[j]];
2390         sum1 -= v1[j]*tmp0;
2391       }
2392       tmp[row++]=sum1;
2393       break;
2394     case 2:
2395       sum1 = b[r[row]];
2396       sum2 = b[r[row+1]];
2397       v2   = aa + ai[row+1];
2398 
2399       for (j=0; j<nz-1; j+=2) {
2400         i0    = vi[j];
2401         i1    = vi[j+1];
2402         tmp0  = tmps[i0];
2403         tmp1  = tmps[i1];
2404         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2405         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2406       }
2407       if (j == nz-1) {
2408         tmp0  = tmps[vi[j]];
2409         sum1 -= v1[j] *tmp0;
2410         sum2 -= v2[j] *tmp0;
2411       }
2412       sum2     -= v2[nz] * sum1;
2413       tmp[row++]=sum1;
2414       tmp[row++]=sum2;
2415       break;
2416     case 3:
2417       sum1 = b[r[row]];
2418       sum2 = b[r[row+1]];
2419       sum3 = b[r[row+2]];
2420       v2   = aa + ai[row+1];
2421       v3   = aa + ai[row+2];
2422 
2423       for (j=0; j<nz-1; j+=2) {
2424         i0    = vi[j];
2425         i1    = vi[j+1];
2426         tmp0  = tmps[i0];
2427         tmp1  = tmps[i1];
2428         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2429         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2430         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2431       }
2432       if (j == nz-1) {
2433         tmp0  = tmps[vi[j]];
2434         sum1 -= v1[j] *tmp0;
2435         sum2 -= v2[j] *tmp0;
2436         sum3 -= v3[j] *tmp0;
2437       }
2438       sum2     -= v2[nz] * sum1;
2439       sum3     -= v3[nz] * sum1;
2440       sum3     -= v3[nz+1] * sum2;
2441       tmp[row++]=sum1;
2442       tmp[row++]=sum2;
2443       tmp[row++]=sum3;
2444       break;
2445 
2446     case 4:
2447       sum1 = b[r[row]];
2448       sum2 = b[r[row+1]];
2449       sum3 = b[r[row+2]];
2450       sum4 = b[r[row+3]];
2451       v2   = aa + ai[row+1];
2452       v3   = aa + ai[row+2];
2453       v4   = aa + ai[row+3];
2454 
2455       for (j=0; j<nz-1; j+=2) {
2456         i0    = vi[j];
2457         i1    = vi[j+1];
2458         tmp0  = tmps[i0];
2459         tmp1  = tmps[i1];
2460         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2461         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2462         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2463         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2464       }
2465       if (j == nz-1) {
2466         tmp0  = tmps[vi[j]];
2467         sum1 -= v1[j] *tmp0;
2468         sum2 -= v2[j] *tmp0;
2469         sum3 -= v3[j] *tmp0;
2470         sum4 -= v4[j] *tmp0;
2471       }
2472       sum2 -= v2[nz] * sum1;
2473       sum3 -= v3[nz] * sum1;
2474       sum4 -= v4[nz] * sum1;
2475       sum3 -= v3[nz+1] * sum2;
2476       sum4 -= v4[nz+1] * sum2;
2477       sum4 -= v4[nz+2] * sum3;
2478 
2479       tmp[row++]=sum1;
2480       tmp[row++]=sum2;
2481       tmp[row++]=sum3;
2482       tmp[row++]=sum4;
2483       break;
2484     case 5:
2485       sum1 = b[r[row]];
2486       sum2 = b[r[row+1]];
2487       sum3 = b[r[row+2]];
2488       sum4 = b[r[row+3]];
2489       sum5 = b[r[row+4]];
2490       v2   = aa + ai[row+1];
2491       v3   = aa + ai[row+2];
2492       v4   = aa + ai[row+3];
2493       v5   = aa + ai[row+4];
2494 
2495       for (j=0; j<nz-1; j+=2) {
2496         i0    = vi[j];
2497         i1    = vi[j+1];
2498         tmp0  = tmps[i0];
2499         tmp1  = tmps[i1];
2500         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2501         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2502         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2503         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2504         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2505       }
2506       if (j == nz-1) {
2507         tmp0  = tmps[vi[j]];
2508         sum1 -= v1[j] *tmp0;
2509         sum2 -= v2[j] *tmp0;
2510         sum3 -= v3[j] *tmp0;
2511         sum4 -= v4[j] *tmp0;
2512         sum5 -= v5[j] *tmp0;
2513       }
2514 
2515       sum2 -= v2[nz] * sum1;
2516       sum3 -= v3[nz] * sum1;
2517       sum4 -= v4[nz] * sum1;
2518       sum5 -= v5[nz] * sum1;
2519       sum3 -= v3[nz+1] * sum2;
2520       sum4 -= v4[nz+1] * sum2;
2521       sum5 -= v5[nz+1] * sum2;
2522       sum4 -= v4[nz+2] * sum3;
2523       sum5 -= v5[nz+2] * sum3;
2524       sum5 -= v5[nz+3] * sum4;
2525 
2526       tmp[row++]=sum1;
2527       tmp[row++]=sum2;
2528       tmp[row++]=sum3;
2529       tmp[row++]=sum4;
2530       tmp[row++]=sum5;
2531       break;
2532     default:
2533       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2534     }
2535   }
2536   /* backward solve the upper triangular */
2537   for (i=node_max -1,row = n-1; i>=0; i--) {
2538     nsz = ns[i];
2539     aii = ad[row+1] + 1;
2540     v1  = aa + aii;
2541     vi  = aj + aii;
2542     nz  = ad[row]- ad[row+1] - 1;
2543 
2544     if (i > 0) {
2545       /* Prefetch the indices for the next block */
2546       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2547       /* Prefetch the data for the next block */
2548       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2549     }
2550 
2551     switch (nsz) {               /* Each loop in 'case' is unrolled */
2552     case 1:
2553       sum1 = tmp[row];
2554 
2555       for (j=0; j<nz-1; j+=2) {
2556         i0    = vi[j];
2557         i1    = vi[j+1];
2558         tmp0  = tmps[i0];
2559         tmp1  = tmps[i1];
2560         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2561       }
2562       if (j == nz-1) {
2563         tmp0  = tmps[vi[j]];
2564         sum1 -= v1[j]*tmp0;
2565       }
2566       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2567       break;
2568     case 2:
2569       sum1 = tmp[row];
2570       sum2 = tmp[row-1];
2571       v2   = aa + ad[row] + 1;
2572       for (j=0; j<nz-1; j+=2) {
2573         i0    = vi[j];
2574         i1    = vi[j+1];
2575         tmp0  = tmps[i0];
2576         tmp1  = tmps[i1];
2577         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2578         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2579       }
2580       if (j == nz-1) {
2581         tmp0  = tmps[vi[j]];
2582         sum1 -= v1[j]* tmp0;
2583         sum2 -= v2[j+1]* tmp0;
2584       }
2585 
2586       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2587       sum2     -= v2[0] * tmp0;
2588       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2589       break;
2590     case 3:
2591       sum1 = tmp[row];
2592       sum2 = tmp[row -1];
2593       sum3 = tmp[row -2];
2594       v2   = aa + ad[row] + 1;
2595       v3   = aa + ad[row -1] + 1;
2596       for (j=0; j<nz-1; j+=2) {
2597         i0    = vi[j];
2598         i1    = vi[j+1];
2599         tmp0  = tmps[i0];
2600         tmp1  = tmps[i1];
2601         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2602         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2603         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2604       }
2605       if (j== nz-1) {
2606         tmp0  = tmps[vi[j]];
2607         sum1 -= v1[j] * tmp0;
2608         sum2 -= v2[j+1] * tmp0;
2609         sum3 -= v3[j+2] * tmp0;
2610       }
2611       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2612       sum2     -= v2[0]* tmp0;
2613       sum3     -= v3[1] * tmp0;
2614       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2615       sum3     -= v3[0]* tmp0;
2616       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2617 
2618       break;
2619     case 4:
2620       sum1 = tmp[row];
2621       sum2 = tmp[row -1];
2622       sum3 = tmp[row -2];
2623       sum4 = tmp[row -3];
2624       v2   = aa + ad[row]+1;
2625       v3   = aa + ad[row -1]+1;
2626       v4   = aa + ad[row -2]+1;
2627 
2628       for (j=0; j<nz-1; j+=2) {
2629         i0    = vi[j];
2630         i1    = vi[j+1];
2631         tmp0  = tmps[i0];
2632         tmp1  = tmps[i1];
2633         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2634         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2635         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2636         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2637       }
2638       if (j== nz-1) {
2639         tmp0  = tmps[vi[j]];
2640         sum1 -= v1[j] * tmp0;
2641         sum2 -= v2[j+1] * tmp0;
2642         sum3 -= v3[j+2] * tmp0;
2643         sum4 -= v4[j+3] * tmp0;
2644       }
2645 
2646       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2647       sum2     -= v2[0] * tmp0;
2648       sum3     -= v3[1] * tmp0;
2649       sum4     -= v4[2] * tmp0;
2650       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2651       sum3     -= v3[0] * tmp0;
2652       sum4     -= v4[1] * tmp0;
2653       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2654       sum4     -= v4[0] * tmp0;
2655       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2656       break;
2657     case 5:
2658       sum1 = tmp[row];
2659       sum2 = tmp[row -1];
2660       sum3 = tmp[row -2];
2661       sum4 = tmp[row -3];
2662       sum5 = tmp[row -4];
2663       v2   = aa + ad[row]+1;
2664       v3   = aa + ad[row -1]+1;
2665       v4   = aa + ad[row -2]+1;
2666       v5   = aa + ad[row -3]+1;
2667       for (j=0; j<nz-1; j+=2) {
2668         i0    = vi[j];
2669         i1    = vi[j+1];
2670         tmp0  = tmps[i0];
2671         tmp1  = tmps[i1];
2672         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2673         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2674         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2675         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2676         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2677       }
2678       if (j==nz-1) {
2679         tmp0  = tmps[vi[j]];
2680         sum1 -= v1[j] * tmp0;
2681         sum2 -= v2[j+1] * tmp0;
2682         sum3 -= v3[j+2] * tmp0;
2683         sum4 -= v4[j+3] * tmp0;
2684         sum5 -= v5[j+4] * tmp0;
2685       }
2686 
2687       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2688       sum2     -= v2[0] * tmp0;
2689       sum3     -= v3[1] * tmp0;
2690       sum4     -= v4[2] * tmp0;
2691       sum5     -= v5[3] * tmp0;
2692       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2693       sum3     -= v3[0] * tmp0;
2694       sum4     -= v4[1] * tmp0;
2695       sum5     -= v5[2] * tmp0;
2696       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2697       sum4     -= v4[0] * tmp0;
2698       sum5     -= v5[1] * tmp0;
2699       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2700       sum5     -= v5[0] * tmp0;
2701       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2702       break;
2703     default:
2704       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2705     }
2706   }
2707   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2708   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2709   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2710   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2711   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2712   PetscFunctionReturn(0);
2713 }
2714 
2715 
2716 /*
2717      Makes a longer coloring[] array and calls the usual code with that
2718 */
2719 #undef __FUNCT__
2720 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2721 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2722 {
2723   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2724   PetscErrorCode  ierr;
2725   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2726   PetscInt        *colorused,i;
2727   ISColoringValue *newcolor;
2728 
2729   PetscFunctionBegin;
2730   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2731   /* loop over inodes, marking a color for each column*/
2732   row = 0;
2733   for (i=0; i<m; i++) {
2734     for (j=0; j<ns[i]; j++) {
2735       newcolor[row++] = coloring[i] + j*ncolors;
2736     }
2737   }
2738 
2739   /* eliminate unneeded colors */
2740   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2741   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2742   for (i=0; i<n; i++) {
2743     colorused[newcolor[i]] = 1;
2744   }
2745 
2746   for (i=1; i<5*ncolors; i++) {
2747     colorused[i] += colorused[i-1];
2748   }
2749   ncolors = colorused[5*ncolors-1];
2750   for (i=0; i<n; i++) {
2751     newcolor[i] = colorused[newcolor[i]]-1;
2752   }
2753   ierr = PetscFree(colorused);CHKERRQ(ierr);
2754   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2755   ierr = PetscFree(coloring);CHKERRQ(ierr);
2756   PetscFunctionReturn(0);
2757 }
2758 
2759 #include <petsc-private/kernels/blockinvert.h>
2760 
2761 #undef __FUNCT__
2762 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2763 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2764 {
2765   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2766   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2767   MatScalar         *ibdiag,*bdiag,work[25],*t;
2768   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2769   const MatScalar   *v = a->a,*v1,*v2,*v3,*v4,*v5;
2770   const PetscScalar *xb, *b;
2771   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2772   PetscErrorCode    ierr;
2773   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2774   PetscInt          sz,k,ipvt[5];
2775   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2776 
2777   PetscFunctionBegin;
2778   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2779   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2780 
2781   if (!a->inode.ibdiagvalid) {
2782     if (!a->inode.ibdiag) {
2783       /* calculate space needed for diagonal blocks */
2784       for (i=0; i<m; i++) {
2785         cnt += sizes[i]*sizes[i];
2786       }
2787       a->inode.bdiagsize = cnt;
2788 
2789       ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2790     }
2791 
2792     /* copy over the diagonal blocks and invert them */
2793     ibdiag = a->inode.ibdiag;
2794     bdiag  = a->inode.bdiag;
2795     cnt    = 0;
2796     for (i=0, row = 0; i<m; i++) {
2797       for (j=0; j<sizes[i]; j++) {
2798         for (k=0; k<sizes[i]; k++) {
2799           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2800         }
2801       }
2802       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2803 
2804       switch (sizes[i]) {
2805       case 1:
2806         /* Create matrix data structure */
2807         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2808         ibdiag[cnt] = 1.0/ibdiag[cnt];
2809         break;
2810       case 2:
2811         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2812         break;
2813       case 3:
2814         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2815         break;
2816       case 4:
2817         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2818         break;
2819       case 5:
2820         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2821         break;
2822       default:
2823         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2824       }
2825       cnt += sizes[i]*sizes[i];
2826       row += sizes[i];
2827     }
2828     a->inode.ibdiagvalid = PETSC_TRUE;
2829   }
2830   ibdiag = a->inode.ibdiag;
2831   bdiag  = a->inode.bdiag;
2832   t      = a->inode.ssor_work;
2833 
2834   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2835   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2836   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2837   if (flag & SOR_ZERO_INITIAL_GUESS) {
2838     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2839 
2840       for (i=0, row=0; i<m; i++) {
2841         sz  = diag[row] - ii[row];
2842         v1  = a->a + ii[row];
2843         idx = a->j + ii[row];
2844 
2845         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2846         switch (sizes[i]) {
2847         case 1:
2848 
2849           sum1 = b[row];
2850           for (n = 0; n<sz-1; n+=2) {
2851             i1    = idx[0];
2852             i2    = idx[1];
2853             idx  += 2;
2854             tmp0  = x[i1];
2855             tmp1  = x[i2];
2856             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2857           }
2858 
2859           if (n == sz-1) {
2860             tmp0  = x[*idx];
2861             sum1 -= *v1 * tmp0;
2862           }
2863           t[row]   = sum1;
2864           x[row++] = sum1*(*ibdiag++);
2865           break;
2866         case 2:
2867           v2   = a->a + ii[row+1];
2868           sum1 = b[row];
2869           sum2 = b[row+1];
2870           for (n = 0; n<sz-1; n+=2) {
2871             i1    = idx[0];
2872             i2    = idx[1];
2873             idx  += 2;
2874             tmp0  = x[i1];
2875             tmp1  = x[i2];
2876             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2877             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2878           }
2879 
2880           if (n == sz-1) {
2881             tmp0  = x[*idx];
2882             sum1 -= v1[0] * tmp0;
2883             sum2 -= v2[0] * tmp0;
2884           }
2885           t[row]   = sum1;
2886           t[row+1] = sum2;
2887           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2888           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2889           ibdiag  += 4;
2890           break;
2891         case 3:
2892           v2   = a->a + ii[row+1];
2893           v3   = a->a + ii[row+2];
2894           sum1 = b[row];
2895           sum2 = b[row+1];
2896           sum3 = b[row+2];
2897           for (n = 0; n<sz-1; n+=2) {
2898             i1    = idx[0];
2899             i2    = idx[1];
2900             idx  += 2;
2901             tmp0  = x[i1];
2902             tmp1  = x[i2];
2903             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2904             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2905             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2906           }
2907 
2908           if (n == sz-1) {
2909             tmp0  = x[*idx];
2910             sum1 -= v1[0] * tmp0;
2911             sum2 -= v2[0] * tmp0;
2912             sum3 -= v3[0] * tmp0;
2913           }
2914           t[row]   = sum1;
2915           t[row+1] = sum2;
2916           t[row+2] = sum3;
2917           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2918           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2919           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2920           ibdiag  += 9;
2921           break;
2922         case 4:
2923           v2   = a->a + ii[row+1];
2924           v3   = a->a + ii[row+2];
2925           v4   = a->a + ii[row+3];
2926           sum1 = b[row];
2927           sum2 = b[row+1];
2928           sum3 = b[row+2];
2929           sum4 = b[row+3];
2930           for (n = 0; n<sz-1; n+=2) {
2931             i1    = idx[0];
2932             i2    = idx[1];
2933             idx  += 2;
2934             tmp0  = x[i1];
2935             tmp1  = x[i2];
2936             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2937             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2938             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2939             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2940           }
2941 
2942           if (n == sz-1) {
2943             tmp0  = x[*idx];
2944             sum1 -= v1[0] * tmp0;
2945             sum2 -= v2[0] * tmp0;
2946             sum3 -= v3[0] * tmp0;
2947             sum4 -= v4[0] * tmp0;
2948           }
2949           t[row]   = sum1;
2950           t[row+1] = sum2;
2951           t[row+2] = sum3;
2952           t[row+3] = sum4;
2953           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2954           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2955           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2956           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2957           ibdiag  += 16;
2958           break;
2959         case 5:
2960           v2   = a->a + ii[row+1];
2961           v3   = a->a + ii[row+2];
2962           v4   = a->a + ii[row+3];
2963           v5   = a->a + ii[row+4];
2964           sum1 = b[row];
2965           sum2 = b[row+1];
2966           sum3 = b[row+2];
2967           sum4 = b[row+3];
2968           sum5 = b[row+4];
2969           for (n = 0; n<sz-1; n+=2) {
2970             i1    = idx[0];
2971             i2    = idx[1];
2972             idx  += 2;
2973             tmp0  = x[i1];
2974             tmp1  = x[i2];
2975             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2976             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2977             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2978             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2979             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2980           }
2981 
2982           if (n == sz-1) {
2983             tmp0  = x[*idx];
2984             sum1 -= v1[0] * tmp0;
2985             sum2 -= v2[0] * tmp0;
2986             sum3 -= v3[0] * tmp0;
2987             sum4 -= v4[0] * tmp0;
2988             sum5 -= v5[0] * tmp0;
2989           }
2990           t[row]   = sum1;
2991           t[row+1] = sum2;
2992           t[row+2] = sum3;
2993           t[row+3] = sum4;
2994           t[row+4] = sum5;
2995           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2996           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2997           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2998           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2999           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3000           ibdiag  += 25;
3001           break;
3002         default:
3003           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3004         }
3005       }
3006 
3007       xb   = t;
3008       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3009     } else xb = b;
3010     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3011 
3012       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3013       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3014         ibdiag -= sizes[i]*sizes[i];
3015         sz      = ii[row+1] - diag[row] - 1;
3016         v1      = a->a + diag[row] + 1;
3017         idx     = a->j + diag[row] + 1;
3018 
3019         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3020         switch (sizes[i]) {
3021         case 1:
3022 
3023           sum1 = xb[row];
3024           for (n = 0; n<sz-1; n+=2) {
3025             i1    = idx[0];
3026             i2    = idx[1];
3027             idx  += 2;
3028             tmp0  = x[i1];
3029             tmp1  = x[i2];
3030             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3031           }
3032 
3033           if (n == sz-1) {
3034             tmp0  = x[*idx];
3035             sum1 -= *v1*tmp0;
3036           }
3037           x[row--] = sum1*(*ibdiag);
3038           break;
3039 
3040         case 2:
3041 
3042           sum1 = xb[row];
3043           sum2 = xb[row-1];
3044           /* note that sum1 is associated with the second of the two rows */
3045           v2 = a->a + diag[row-1] + 2;
3046           for (n = 0; n<sz-1; n+=2) {
3047             i1    = idx[0];
3048             i2    = idx[1];
3049             idx  += 2;
3050             tmp0  = x[i1];
3051             tmp1  = x[i2];
3052             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3053             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3054           }
3055 
3056           if (n == sz-1) {
3057             tmp0  = x[*idx];
3058             sum1 -= *v1*tmp0;
3059             sum2 -= *v2*tmp0;
3060           }
3061           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3062           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3063           break;
3064         case 3:
3065 
3066           sum1 = xb[row];
3067           sum2 = xb[row-1];
3068           sum3 = xb[row-2];
3069           v2   = a->a + diag[row-1] + 2;
3070           v3   = a->a + diag[row-2] + 3;
3071           for (n = 0; n<sz-1; n+=2) {
3072             i1    = idx[0];
3073             i2    = idx[1];
3074             idx  += 2;
3075             tmp0  = x[i1];
3076             tmp1  = x[i2];
3077             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3078             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3079             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3080           }
3081 
3082           if (n == sz-1) {
3083             tmp0  = x[*idx];
3084             sum1 -= *v1*tmp0;
3085             sum2 -= *v2*tmp0;
3086             sum3 -= *v3*tmp0;
3087           }
3088           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3089           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3090           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3091           break;
3092         case 4:
3093 
3094           sum1 = xb[row];
3095           sum2 = xb[row-1];
3096           sum3 = xb[row-2];
3097           sum4 = xb[row-3];
3098           v2   = a->a + diag[row-1] + 2;
3099           v3   = a->a + diag[row-2] + 3;
3100           v4   = a->a + diag[row-3] + 4;
3101           for (n = 0; n<sz-1; n+=2) {
3102             i1    = idx[0];
3103             i2    = idx[1];
3104             idx  += 2;
3105             tmp0  = x[i1];
3106             tmp1  = x[i2];
3107             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3108             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3109             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3110             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3111           }
3112 
3113           if (n == sz-1) {
3114             tmp0  = x[*idx];
3115             sum1 -= *v1*tmp0;
3116             sum2 -= *v2*tmp0;
3117             sum3 -= *v3*tmp0;
3118             sum4 -= *v4*tmp0;
3119           }
3120           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3121           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3122           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3123           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3124           break;
3125         case 5:
3126 
3127           sum1 = xb[row];
3128           sum2 = xb[row-1];
3129           sum3 = xb[row-2];
3130           sum4 = xb[row-3];
3131           sum5 = xb[row-4];
3132           v2   = a->a + diag[row-1] + 2;
3133           v3   = a->a + diag[row-2] + 3;
3134           v4   = a->a + diag[row-3] + 4;
3135           v5   = a->a + diag[row-4] + 5;
3136           for (n = 0; n<sz-1; n+=2) {
3137             i1    = idx[0];
3138             i2    = idx[1];
3139             idx  += 2;
3140             tmp0  = x[i1];
3141             tmp1  = x[i2];
3142             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3143             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3144             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3145             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3146             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3147           }
3148 
3149           if (n == sz-1) {
3150             tmp0  = x[*idx];
3151             sum1 -= *v1*tmp0;
3152             sum2 -= *v2*tmp0;
3153             sum3 -= *v3*tmp0;
3154             sum4 -= *v4*tmp0;
3155             sum5 -= *v5*tmp0;
3156           }
3157           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3158           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3159           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3160           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3161           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3162           break;
3163         default:
3164           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3165         }
3166       }
3167 
3168       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3169     }
3170     its--;
3171   }
3172   while (its--) {
3173 
3174     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3175       ibdiag = a->inode.ibdiag;
3176 
3177       for (i=0, row=0; i<m; i++) {
3178         sz  = ii[row + 1] - ii[row];
3179         v1  = a->a + ii[row];
3180         idx = a->j + ii[row];
3181 
3182         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3183         switch (sizes[i]) {
3184         case 1:
3185 
3186           sum1 = b[row];
3187           for (n = 0; n<sz-1; n+=2) {
3188             i1    = idx[0];
3189             i2    = idx[1];
3190             idx  += 2;
3191             tmp0  = x[i1];
3192             tmp1  = x[i2];
3193             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3194           }
3195 
3196           if (n == sz-1) {
3197             tmp0  = x[*idx];
3198             sum1 -= *v1 * tmp0;
3199           }
3200 
3201           /* in MatSOR_SeqAIJ this line would be
3202            *
3203            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3204            *
3205            * but omega == 1, so this becomes
3206            *
3207            * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++);
3208            *
3209            * but bdiag and ibdiag cancel each other, so we can change this
3210            * to adding sum1*(*ibdiag++).  We can skip bdiag for the larger
3211            * block sizes as well
3212            */
3213           x[row++] += sum1*(*ibdiag++);
3214           break;
3215         case 2:
3216           v2   = a->a + ii[row+1];
3217           sum1 = b[row];
3218           sum2 = b[row+1];
3219           for (n = 0; n<sz-1; n+=2) {
3220             i1    = idx[0];
3221             i2    = idx[1];
3222             idx  += 2;
3223             tmp0  = x[i1];
3224             tmp1  = x[i2];
3225             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3226             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3227           }
3228 
3229           if (n == sz-1) {
3230             tmp0  = x[*idx];
3231             sum1 -= v1[0] * tmp0;
3232             sum2 -= v2[0] * tmp0;
3233           }
3234           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2];
3235           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3];
3236           ibdiag   += 4;
3237           break;
3238         case 3:
3239           v2   = a->a + ii[row+1];
3240           v3   = a->a + ii[row+2];
3241           sum1 = b[row];
3242           sum2 = b[row+1];
3243           sum3 = b[row+2];
3244           for (n = 0; n<sz-1; n+=2) {
3245             i1    = idx[0];
3246             i2    = idx[1];
3247             idx  += 2;
3248             tmp0  = x[i1];
3249             tmp1  = x[i2];
3250             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3251             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3252             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3253           }
3254 
3255           if (n == sz-1) {
3256             tmp0  = x[*idx];
3257             sum1 -= v1[0] * tmp0;
3258             sum2 -= v2[0] * tmp0;
3259             sum3 -= v3[0] * tmp0;
3260           }
3261           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3262           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3263           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3264           ibdiag   += 9;
3265           break;
3266         case 4:
3267           v2   = a->a + ii[row+1];
3268           v3   = a->a + ii[row+2];
3269           v4   = a->a + ii[row+3];
3270           sum1 = b[row];
3271           sum2 = b[row+1];
3272           sum3 = b[row+2];
3273           sum4 = b[row+3];
3274           for (n = 0; n<sz-1; n+=2) {
3275             i1    = idx[0];
3276             i2    = idx[1];
3277             idx  += 2;
3278             tmp0  = x[i1];
3279             tmp1  = x[i2];
3280             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3281             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3282             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3283             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3284           }
3285 
3286           if (n == sz-1) {
3287             tmp0  = x[*idx];
3288             sum1 -= v1[0] * tmp0;
3289             sum2 -= v2[0] * tmp0;
3290             sum3 -= v3[0] * tmp0;
3291             sum4 -= v4[0] * tmp0;
3292           }
3293           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3294           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3295           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3296           x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3297           ibdiag   += 16;
3298           break;
3299         case 5:
3300           v2   = a->a + ii[row+1];
3301           v3   = a->a + ii[row+2];
3302           v4   = a->a + ii[row+3];
3303           v5   = a->a + ii[row+4];
3304           sum1 = b[row];
3305           sum2 = b[row+1];
3306           sum3 = b[row+2];
3307           sum4 = b[row+3];
3308           sum5 = b[row+4];
3309           for (n = 0; n<sz-1; n+=2) {
3310             i1    = idx[0];
3311             i2    = idx[1];
3312             idx  += 2;
3313             tmp0  = x[i1];
3314             tmp1  = x[i2];
3315             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3316             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3317             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3318             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3319             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3320           }
3321 
3322           if (n == sz-1) {
3323             tmp0  = x[*idx];
3324             sum1 -= v1[0] * tmp0;
3325             sum2 -= v2[0] * tmp0;
3326             sum3 -= v3[0] * tmp0;
3327             sum4 -= v4[0] * tmp0;
3328             sum5 -= v5[0] * tmp0;
3329           }
3330           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3331           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3332           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3333           x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3334           x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3335           ibdiag   += 25;
3336           break;
3337         default:
3338           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3339         }
3340       }
3341 
3342       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3343     }
3344     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3345 
3346       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3347       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3348         ibdiag -= sizes[i]*sizes[i];
3349         sz      = ii[row+1] - ii[row];
3350         v1      = a->a + ii[row];
3351         idx     = a->j + ii[row];
3352 
3353         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3354         switch (sizes[i]) {
3355         case 1:
3356 
3357           sum1 = b[row];
3358           for (n = 0; n<sz-1; n+=2) {
3359             i1    = idx[0];
3360             i2    = idx[1];
3361             idx  += 2;
3362             tmp0  = x[i1];
3363             tmp1  = x[i2];
3364             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3365           }
3366 
3367           if (n == sz-1) {
3368             tmp0  = x[*idx];
3369             sum1 -= *v1*tmp0;
3370           }
3371           x[row--] += sum1*(*ibdiag);
3372           break;
3373 
3374         case 2:
3375 
3376           sum1 = b[row];
3377           sum2 = b[row-1];
3378           /* note that sum1 is associated with the second of the two rows */
3379           v2 = a->a + ii[row - 1];
3380           for (n = 0; n<sz-1; n+=2) {
3381             i1    = idx[0];
3382             i2    = idx[1];
3383             idx  += 2;
3384             tmp0  = x[i1];
3385             tmp1  = x[i2];
3386             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3387             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3388           }
3389 
3390           if (n == sz-1) {
3391             tmp0  = x[*idx];
3392             sum1 -= *v1*tmp0;
3393             sum2 -= *v2*tmp0;
3394           }
3395           x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3396           x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3397           break;
3398         case 3:
3399 
3400           sum1 = b[row];
3401           sum2 = b[row-1];
3402           sum3 = b[row-2];
3403           v2   = a->a + ii[row-1];
3404           v3   = a->a + ii[row-2];
3405           for (n = 0; n<sz-1; n+=2) {
3406             i1    = idx[0];
3407             i2    = idx[1];
3408             idx  += 2;
3409             tmp0  = x[i1];
3410             tmp1  = x[i2];
3411             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3412             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3413             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3414           }
3415 
3416           if (n == sz-1) {
3417             tmp0  = x[*idx];
3418             sum1 -= *v1*tmp0;
3419             sum2 -= *v2*tmp0;
3420             sum3 -= *v3*tmp0;
3421           }
3422           x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3423           x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3424           x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3425           break;
3426         case 4:
3427 
3428           sum1 = b[row];
3429           sum2 = b[row-1];
3430           sum3 = b[row-2];
3431           sum4 = b[row-3];
3432           v2   = a->a + ii[row-1];
3433           v3   = a->a + ii[row-2];
3434           v4   = a->a + ii[row-3];
3435           for (n = 0; n<sz-1; n+=2) {
3436             i1    = idx[0];
3437             i2    = idx[1];
3438             idx  += 2;
3439             tmp0  = x[i1];
3440             tmp1  = x[i2];
3441             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3442             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3443             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3444             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3445           }
3446 
3447           if (n == sz-1) {
3448             tmp0  = x[*idx];
3449             sum1 -= *v1*tmp0;
3450             sum2 -= *v2*tmp0;
3451             sum3 -= *v3*tmp0;
3452             sum4 -= *v4*tmp0;
3453           }
3454           x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3455           x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3456           x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3457           x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3458           break;
3459         case 5:
3460 
3461           sum1 = b[row];
3462           sum2 = b[row-1];
3463           sum3 = b[row-2];
3464           sum4 = b[row-3];
3465           sum5 = b[row-4];
3466           v2   = a->a + ii[row-1];
3467           v3   = a->a + ii[row-2];
3468           v4   = a->a + ii[row-3];
3469           v5   = a->a + ii[row-4];
3470           for (n = 0; n<sz-1; n+=2) {
3471             i1    = idx[0];
3472             i2    = idx[1];
3473             idx  += 2;
3474             tmp0  = x[i1];
3475             tmp1  = x[i2];
3476             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3477             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3478             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3479             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3480             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3481           }
3482 
3483           if (n == sz-1) {
3484             tmp0  = x[*idx];
3485             sum1 -= *v1*tmp0;
3486             sum2 -= *v2*tmp0;
3487             sum3 -= *v3*tmp0;
3488             sum4 -= *v4*tmp0;
3489             sum5 -= *v5*tmp0;
3490           }
3491           x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3492           x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3493           x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3494           x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3495           x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3496           break;
3497         default:
3498           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3499         }
3500       }
3501 
3502       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3503     }
3504   }
3505   if (flag & SOR_EISENSTAT) {
3506     /*
3507           Apply  (U + D)^-1  where D is now the block diagonal
3508     */
3509     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3510     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3511       ibdiag -= sizes[i]*sizes[i];
3512       sz      = ii[row+1] - diag[row] - 1;
3513       v1      = a->a + diag[row] + 1;
3514       idx     = a->j + diag[row] + 1;
3515       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3516       switch (sizes[i]) {
3517       case 1:
3518 
3519         sum1 = b[row];
3520         for (n = 0; n<sz-1; n+=2) {
3521           i1    = idx[0];
3522           i2    = idx[1];
3523           idx  += 2;
3524           tmp0  = x[i1];
3525           tmp1  = x[i2];
3526           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3527         }
3528 
3529         if (n == sz-1) {
3530           tmp0  = x[*idx];
3531           sum1 -= *v1*tmp0;
3532         }
3533         x[row] = sum1*(*ibdiag);row--;
3534         break;
3535 
3536       case 2:
3537 
3538         sum1 = b[row];
3539         sum2 = b[row-1];
3540         /* note that sum1 is associated with the second of the two rows */
3541         v2 = a->a + diag[row-1] + 2;
3542         for (n = 0; n<sz-1; n+=2) {
3543           i1    = idx[0];
3544           i2    = idx[1];
3545           idx  += 2;
3546           tmp0  = x[i1];
3547           tmp1  = x[i2];
3548           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3549           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3550         }
3551 
3552         if (n == sz-1) {
3553           tmp0  = x[*idx];
3554           sum1 -= *v1*tmp0;
3555           sum2 -= *v2*tmp0;
3556         }
3557         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3558         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3559         row     -= 2;
3560         break;
3561       case 3:
3562 
3563         sum1 = b[row];
3564         sum2 = b[row-1];
3565         sum3 = b[row-2];
3566         v2   = a->a + diag[row-1] + 2;
3567         v3   = a->a + diag[row-2] + 3;
3568         for (n = 0; n<sz-1; n+=2) {
3569           i1    = idx[0];
3570           i2    = idx[1];
3571           idx  += 2;
3572           tmp0  = x[i1];
3573           tmp1  = x[i2];
3574           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3575           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3576           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3577         }
3578 
3579         if (n == sz-1) {
3580           tmp0  = x[*idx];
3581           sum1 -= *v1*tmp0;
3582           sum2 -= *v2*tmp0;
3583           sum3 -= *v3*tmp0;
3584         }
3585         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3586         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3587         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3588         row     -= 3;
3589         break;
3590       case 4:
3591 
3592         sum1 = b[row];
3593         sum2 = b[row-1];
3594         sum3 = b[row-2];
3595         sum4 = b[row-3];
3596         v2   = a->a + diag[row-1] + 2;
3597         v3   = a->a + diag[row-2] + 3;
3598         v4   = a->a + diag[row-3] + 4;
3599         for (n = 0; n<sz-1; n+=2) {
3600           i1    = idx[0];
3601           i2    = idx[1];
3602           idx  += 2;
3603           tmp0  = x[i1];
3604           tmp1  = x[i2];
3605           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3606           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3607           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3608           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3609         }
3610 
3611         if (n == sz-1) {
3612           tmp0  = x[*idx];
3613           sum1 -= *v1*tmp0;
3614           sum2 -= *v2*tmp0;
3615           sum3 -= *v3*tmp0;
3616           sum4 -= *v4*tmp0;
3617         }
3618         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3619         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3620         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3621         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3622         row     -= 4;
3623         break;
3624       case 5:
3625 
3626         sum1 = b[row];
3627         sum2 = b[row-1];
3628         sum3 = b[row-2];
3629         sum4 = b[row-3];
3630         sum5 = b[row-4];
3631         v2   = a->a + diag[row-1] + 2;
3632         v3   = a->a + diag[row-2] + 3;
3633         v4   = a->a + diag[row-3] + 4;
3634         v5   = a->a + diag[row-4] + 5;
3635         for (n = 0; n<sz-1; n+=2) {
3636           i1    = idx[0];
3637           i2    = idx[1];
3638           idx  += 2;
3639           tmp0  = x[i1];
3640           tmp1  = x[i2];
3641           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3642           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3643           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3644           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3645           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3646         }
3647 
3648         if (n == sz-1) {
3649           tmp0  = x[*idx];
3650           sum1 -= *v1*tmp0;
3651           sum2 -= *v2*tmp0;
3652           sum3 -= *v3*tmp0;
3653           sum4 -= *v4*tmp0;
3654           sum5 -= *v5*tmp0;
3655         }
3656         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3657         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3658         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3659         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3660         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3661         row     -= 5;
3662         break;
3663       default:
3664         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3665       }
3666     }
3667     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3668 
3669     /*
3670            t = b - D x    where D is the block diagonal
3671     */
3672     cnt = 0;
3673     for (i=0, row=0; i<m; i++) {
3674       switch (sizes[i]) {
3675       case 1:
3676         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3677         break;
3678       case 2:
3679         x1       = x[row]; x2 = x[row+1];
3680         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3681         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3682         t[row]   = b[row] - tmp1;
3683         t[row+1] = b[row+1] - tmp2; row += 2;
3684         cnt     += 4;
3685         break;
3686       case 3:
3687         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3688         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3689         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3690         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3691         t[row]   = b[row] - tmp1;
3692         t[row+1] = b[row+1] - tmp2;
3693         t[row+2] = b[row+2] - tmp3; row += 3;
3694         cnt     += 9;
3695         break;
3696       case 4:
3697         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3698         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3699         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3700         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3701         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3702         t[row]   = b[row] - tmp1;
3703         t[row+1] = b[row+1] - tmp2;
3704         t[row+2] = b[row+2] - tmp3;
3705         t[row+3] = b[row+3] - tmp4; row += 4;
3706         cnt     += 16;
3707         break;
3708       case 5:
3709         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3710         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3711         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3712         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3713         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3714         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3715         t[row]   = b[row] - tmp1;
3716         t[row+1] = b[row+1] - tmp2;
3717         t[row+2] = b[row+2] - tmp3;
3718         t[row+3] = b[row+3] - tmp4;
3719         t[row+4] = b[row+4] - tmp5;row += 5;
3720         cnt     += 25;
3721         break;
3722       default:
3723         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3724       }
3725     }
3726     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3727 
3728 
3729 
3730     /*
3731           Apply (L + D)^-1 where D is the block diagonal
3732     */
3733     for (i=0, row=0; i<m; i++) {
3734       sz  = diag[row] - ii[row];
3735       v1  = a->a + ii[row];
3736       idx = a->j + ii[row];
3737       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3738       switch (sizes[i]) {
3739       case 1:
3740 
3741         sum1 = t[row];
3742         for (n = 0; n<sz-1; n+=2) {
3743           i1    = idx[0];
3744           i2    = idx[1];
3745           idx  += 2;
3746           tmp0  = t[i1];
3747           tmp1  = t[i2];
3748           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3749         }
3750 
3751         if (n == sz-1) {
3752           tmp0  = t[*idx];
3753           sum1 -= *v1 * tmp0;
3754         }
3755         x[row] += t[row] = sum1*(*ibdiag++); row++;
3756         break;
3757       case 2:
3758         v2   = a->a + ii[row+1];
3759         sum1 = t[row];
3760         sum2 = t[row+1];
3761         for (n = 0; n<sz-1; n+=2) {
3762           i1    = idx[0];
3763           i2    = idx[1];
3764           idx  += 2;
3765           tmp0  = t[i1];
3766           tmp1  = t[i2];
3767           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3768           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3769         }
3770 
3771         if (n == sz-1) {
3772           tmp0  = t[*idx];
3773           sum1 -= v1[0] * tmp0;
3774           sum2 -= v2[0] * tmp0;
3775         }
3776         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3777         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3778         ibdiag   += 4; row += 2;
3779         break;
3780       case 3:
3781         v2   = a->a + ii[row+1];
3782         v3   = a->a + ii[row+2];
3783         sum1 = t[row];
3784         sum2 = t[row+1];
3785         sum3 = t[row+2];
3786         for (n = 0; n<sz-1; n+=2) {
3787           i1    = idx[0];
3788           i2    = idx[1];
3789           idx  += 2;
3790           tmp0  = t[i1];
3791           tmp1  = t[i2];
3792           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3793           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3794           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3795         }
3796 
3797         if (n == sz-1) {
3798           tmp0  = t[*idx];
3799           sum1 -= v1[0] * tmp0;
3800           sum2 -= v2[0] * tmp0;
3801           sum3 -= v3[0] * tmp0;
3802         }
3803         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3804         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3805         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3806         ibdiag   += 9; row += 3;
3807         break;
3808       case 4:
3809         v2   = a->a + ii[row+1];
3810         v3   = a->a + ii[row+2];
3811         v4   = a->a + ii[row+3];
3812         sum1 = t[row];
3813         sum2 = t[row+1];
3814         sum3 = t[row+2];
3815         sum4 = t[row+3];
3816         for (n = 0; n<sz-1; n+=2) {
3817           i1    = idx[0];
3818           i2    = idx[1];
3819           idx  += 2;
3820           tmp0  = t[i1];
3821           tmp1  = t[i2];
3822           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3823           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3824           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3825           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3826         }
3827 
3828         if (n == sz-1) {
3829           tmp0  = t[*idx];
3830           sum1 -= v1[0] * tmp0;
3831           sum2 -= v2[0] * tmp0;
3832           sum3 -= v3[0] * tmp0;
3833           sum4 -= v4[0] * tmp0;
3834         }
3835         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3836         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3837         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3838         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3839         ibdiag   += 16; row += 4;
3840         break;
3841       case 5:
3842         v2   = a->a + ii[row+1];
3843         v3   = a->a + ii[row+2];
3844         v4   = a->a + ii[row+3];
3845         v5   = a->a + ii[row+4];
3846         sum1 = t[row];
3847         sum2 = t[row+1];
3848         sum3 = t[row+2];
3849         sum4 = t[row+3];
3850         sum5 = t[row+4];
3851         for (n = 0; n<sz-1; n+=2) {
3852           i1    = idx[0];
3853           i2    = idx[1];
3854           idx  += 2;
3855           tmp0  = t[i1];
3856           tmp1  = t[i2];
3857           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3858           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3859           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3860           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3861           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3862         }
3863 
3864         if (n == sz-1) {
3865           tmp0  = t[*idx];
3866           sum1 -= v1[0] * tmp0;
3867           sum2 -= v2[0] * tmp0;
3868           sum3 -= v3[0] * tmp0;
3869           sum4 -= v4[0] * tmp0;
3870           sum5 -= v5[0] * tmp0;
3871         }
3872         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3873         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3874         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3875         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3876         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3877         ibdiag   += 25; row += 5;
3878         break;
3879       default:
3880         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3881       }
3882     }
3883     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3884   }
3885   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3886   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3887   PetscFunctionReturn(0);
3888 }
3889 
3890 #undef __FUNCT__
3891 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3892 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3893 {
3894   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3895   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3896   const MatScalar   *bdiag = a->inode.bdiag;
3897   const PetscScalar *b;
3898   PetscErrorCode    ierr;
3899   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3900   const PetscInt    *sizes = a->inode.size;
3901 
3902   PetscFunctionBegin;
3903   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3904   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3905   cnt  = 0;
3906   for (i=0, row=0; i<m; i++) {
3907     switch (sizes[i]) {
3908     case 1:
3909       x[row] = b[row]*bdiag[cnt++];row++;
3910       break;
3911     case 2:
3912       x1       = b[row]; x2 = b[row+1];
3913       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3914       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3915       x[row++] = tmp1;
3916       x[row++] = tmp2;
3917       cnt     += 4;
3918       break;
3919     case 3:
3920       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3921       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3922       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3923       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3924       x[row++] = tmp1;
3925       x[row++] = tmp2;
3926       x[row++] = tmp3;
3927       cnt     += 9;
3928       break;
3929     case 4:
3930       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3931       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3932       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3933       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3934       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3935       x[row++] = tmp1;
3936       x[row++] = tmp2;
3937       x[row++] = tmp3;
3938       x[row++] = tmp4;
3939       cnt     += 16;
3940       break;
3941     case 5:
3942       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3943       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3944       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3945       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3946       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3947       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3948       x[row++] = tmp1;
3949       x[row++] = tmp2;
3950       x[row++] = tmp3;
3951       x[row++] = tmp4;
3952       x[row++] = tmp5;
3953       cnt     += 25;
3954       break;
3955     default:
3956       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3957     }
3958   }
3959   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3960   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3961   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3962   PetscFunctionReturn(0);
3963 }
3964 
3965 /*
3966     samestructure indicates that the matrix has not changed its nonzero structure so we
3967     do not need to recompute the inodes
3968 */
3969 #undef __FUNCT__
3970 #define __FUNCT__ "Mat_CheckInode"
3971 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure)
3972 {
3973   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3974   PetscErrorCode ierr;
3975   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
3976   PetscBool      flag;
3977   const PetscInt *idx,*idy,*ii;
3978 
3979   PetscFunctionBegin;
3980   if (!a->inode.use) PetscFunctionReturn(0);
3981   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3982 
3983 
3984   m = A->rmap->n;
3985   if (a->inode.size) ns = a->inode.size;
3986   else {
3987     ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);
3988   }
3989 
3990   i          = 0;
3991   node_count = 0;
3992   idx        = a->j;
3993   ii         = a->i;
3994   while (i < m) {                /* For each row */
3995     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3996     /* Limits the number of elements in a node to 'a->inode.limit' */
3997     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3998       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
3999       if (nzy != nzx) break;
4000       idy += nzx;              /* Same nonzero pattern */
4001       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4002       if (!flag) break;
4003     }
4004     ns[node_count++] = blk_size;
4005     idx             += blk_size*nzx;
4006     i                = j;
4007   }
4008   /* If not enough inodes found,, do not use inode version of the routines */
4009   if (!m || node_count > .8*m) {
4010     ierr = PetscFree(ns);CHKERRQ(ierr);
4011 
4012     a->inode.node_count       = 0;
4013     a->inode.size             = NULL;
4014     a->inode.use              = PETSC_FALSE;
4015     A->ops->mult              = MatMult_SeqAIJ;
4016     A->ops->sor               = MatSOR_SeqAIJ;
4017     A->ops->multadd           = MatMultAdd_SeqAIJ;
4018     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4019     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4020     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4021     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4022     A->ops->coloringpatch     = 0;
4023     A->ops->multdiagonalblock = 0;
4024 
4025     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4026   } else {
4027     if (!A->factortype) {
4028       A->ops->mult              = MatMult_SeqAIJ_Inode;
4029       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4030       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4031       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4032       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4033       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4034       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4035       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4036       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4037     } else {
4038       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4039     }
4040     a->inode.node_count = node_count;
4041     a->inode.size       = ns;
4042     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4043   }
4044   a->inode.checked = PETSC_TRUE;
4045   PetscFunctionReturn(0);
4046 }
4047 
4048 #undef __FUNCT__
4049 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4050 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4051 {
4052   Mat            B =*C;
4053   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4054   PetscErrorCode ierr;
4055   PetscInt       m=A->rmap->n;
4056 
4057   PetscFunctionBegin;
4058   c->inode.use       = a->inode.use;
4059   c->inode.limit     = a->inode.limit;
4060   c->inode.max_limit = a->inode.max_limit;
4061   if (a->inode.size) {
4062     ierr                = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr);
4063     c->inode.node_count = a->inode.node_count;
4064     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4065     /* note the table of functions below should match that in Mat_CheckInode() */
4066     if (!B->factortype) {
4067       B->ops->mult              = MatMult_SeqAIJ_Inode;
4068       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4069       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4070       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4071       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4072       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4073       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4074       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4075       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4076     } else {
4077       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4078     }
4079   } else {
4080     c->inode.size       = 0;
4081     c->inode.node_count = 0;
4082   }
4083   c->inode.ibdiagvalid = PETSC_FALSE;
4084   c->inode.ibdiag      = 0;
4085   c->inode.bdiag       = 0;
4086   PetscFunctionReturn(0);
4087 }
4088 
4089 #undef __FUNCT__
4090 #define __FUNCT__ "MatGetRow_FactoredLU"
4091 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4092 {
4093   PetscInt       k;
4094   const PetscInt *vi;
4095 
4096   PetscFunctionBegin;
4097   vi = aj + ai[row];
4098   for (k=0; k<nzl; k++) cols[k] = vi[k];
4099   vi        = aj + adiag[row];
4100   cols[nzl] = vi[0];
4101   vi        = aj + adiag[row+1]+1;
4102   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4103   PetscFunctionReturn(0);
4104 }
4105 /*
4106    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4107    Modified from Mat_CheckInode().
4108 
4109    Input Parameters:
4110 +  Mat A - ILU or LU matrix factor
4111 -  samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
4112     do not need to recompute the inodes
4113 */
4114 #undef __FUNCT__
4115 #define __FUNCT__ "Mat_CheckInode_FactorLU"
4116 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure)
4117 {
4118   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4119   PetscErrorCode ierr;
4120   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4121   PetscInt       *cols1,*cols2,*ns;
4122   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4123   PetscBool      flag;
4124 
4125   PetscFunctionBegin;
4126   if (!a->inode.use)                     PetscFunctionReturn(0);
4127   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
4128 
4129   m = A->rmap->n;
4130   if (a->inode.size) ns = a->inode.size;
4131   else {
4132     ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);
4133   }
4134 
4135   i          = 0;
4136   node_count = 0;
4137   ierr = PetscMalloc2(m,PetscInt,&cols1,m,PetscInt,&cols2);CHKERRQ(ierr);
4138   while (i < m) {                /* For each row */
4139     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4140     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4141     nzx  = nzl1 + nzu1 + 1;
4142     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4143 
4144     /* Limits the number of elements in a node to 'a->inode.limit' */
4145     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4146       nzl2 = ai[j+1] - ai[j];
4147       nzu2 = adiag[j] - adiag[j+1] - 1;
4148       nzy  = nzl2 + nzu2 + 1;
4149       if (nzy != nzx) break;
4150       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4151       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4152       if (!flag) break;
4153     }
4154     ns[node_count++] = blk_size;
4155     i                = j;
4156   }
4157   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4158   /* If not enough inodes found,, do not use inode version of the routines */
4159   if (!m || node_count > .8*m) {
4160     ierr = PetscFree(ns);CHKERRQ(ierr);
4161 
4162     a->inode.node_count = 0;
4163     a->inode.size       = NULL;
4164     a->inode.use        = PETSC_FALSE;
4165 
4166     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4167   } else {
4168     A->ops->mult              = 0;
4169     A->ops->sor               = 0;
4170     A->ops->multadd           = 0;
4171     A->ops->getrowij          = 0;
4172     A->ops->restorerowij      = 0;
4173     A->ops->getcolumnij       = 0;
4174     A->ops->restorecolumnij   = 0;
4175     A->ops->coloringpatch     = 0;
4176     A->ops->multdiagonalblock = 0;
4177     A->ops->solve             = MatSolve_SeqAIJ_Inode;
4178     a->inode.node_count       = node_count;
4179     a->inode.size             = ns;
4180 
4181     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4182   }
4183   a->inode.checked = PETSC_TRUE;
4184   PetscFunctionReturn(0);
4185 }
4186 
4187 #undef __FUNCT__
4188 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4189 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4190 {
4191   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4192 
4193   PetscFunctionBegin;
4194   a->inode.ibdiagvalid = PETSC_FALSE;
4195   PetscFunctionReturn(0);
4196 }
4197 
4198 /*
4199      This is really ugly. if inodes are used this replaces the
4200   permutations with ones that correspond to rows/cols of the matrix
4201   rather then inode blocks
4202 */
4203 #undef __FUNCT__
4204 #define __FUNCT__ "MatInodeAdjustForInodes"
4205 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4206 {
4207   PetscErrorCode ierr;
4208 
4209   PetscFunctionBegin;
4210   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4211   PetscFunctionReturn(0);
4212 }
4213 
4214 #undef __FUNCT__
4215 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4216 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4217 {
4218   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4219   PetscErrorCode ierr;
4220   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4221   const PetscInt *ridx,*cidx;
4222   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4223   PetscInt       nslim_col,*ns_col;
4224   IS             ris = *rperm,cis = *cperm;
4225 
4226   PetscFunctionBegin;
4227   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4228   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4229 
4230   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4231   ierr = PetscMalloc((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
4232   ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
4233 
4234   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4235   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4236 
4237   /* Form the inode structure for the rows of permuted matric using inv perm*/
4238   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4239 
4240   /* Construct the permutations for rows*/
4241   for (i=0,row = 0; i<nslim_row; ++i) {
4242     indx      = ridx[i];
4243     start_val = tns[indx];
4244     end_val   = tns[indx + 1];
4245     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4246   }
4247 
4248   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4249   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4250 
4251   /* Construct permutations for columns */
4252   for (i=0,col=0; i<nslim_col; ++i) {
4253     indx      = cidx[i];
4254     start_val = tns[indx];
4255     end_val   = tns[indx + 1];
4256     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4257   }
4258 
4259   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4260   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4261   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4262   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4263 
4264   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4265   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4266 
4267   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4268   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4269   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4270   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4271   ierr = PetscFree(tns);CHKERRQ(ierr);
4272   PetscFunctionReturn(0);
4273 }
4274 
4275 #undef __FUNCT__
4276 #define __FUNCT__ "MatInodeGetInodeSizes"
4277 /*@C
4278    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4279 
4280    Not Collective
4281 
4282    Input Parameter:
4283 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4284 
4285    Output Parameter:
4286 +  node_count - no of inodes present in the matrix.
4287 .  sizes      - an array of size node_count,with sizes of each inode.
4288 -  limit      - the max size used to generate the inodes.
4289 
4290    Level: advanced
4291 
4292    Notes: This routine returns some internal storage information
4293    of the matrix, it is intended to be used by advanced users.
4294    It should be called after the matrix is assembled.
4295    The contents of the sizes[] array should not be changed.
4296    NULL may be passed for information not requested.
4297 
4298 .keywords: matrix, seqaij, get, inode
4299 
4300 .seealso: MatGetInfo()
4301 @*/
4302 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4303 {
4304   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4305 
4306   PetscFunctionBegin;
4307   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4308   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4309   if (f) {
4310     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4311   }
4312   PetscFunctionReturn(0);
4313 }
4314 
4315 #undef __FUNCT__
4316 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4317 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4318 {
4319   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4320 
4321   PetscFunctionBegin;
4322   if (node_count) *node_count = a->inode.node_count;
4323   if (sizes)      *sizes      = a->inode.size;
4324   if (limit)      *limit      = a->inode.limit;
4325   PetscFunctionReturn(0);
4326 }
4327