xref: /petsc/src/mat/impls/aij/seq/inode.c (revision a455e9261d77363e13c8b22a0e0ea831d71fb0ff)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1) {
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     col  = *j++ + ishift;
101     i2   = tvc[col];
102     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103       ia[i1+1]++;
104       ia[i2+1]++;
105       i2++;                     /* Start col of next node */
106       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107       i2 = tvc[col];
108     }
109     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
110   }
111 
112   /* shift ia[i] to point to next row */
113   for (i1=1; i1<nslim_row+1; i1++) {
114     row        = ia[i1-1];
115     ia[i1]    += row;
116     work[i1-1] = row - oshift;
117   }
118 
119   /* allocate space for column pointers */
120   nz   = ia[nslim_row] + (!ishift);
121   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
122   *jja = ja;
123 
124   /* loop over lower triangular part putting into ja */
125   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126     j    = aj + ai[row] + ishift;
127     jmax = aj + ai[row+1] + ishift;
128     col  = *j++ + ishift;
129     i2   = tvc[col];
130     while (i2<i1 && j<jmax) {
131       ja[work[i2]++] = i1 + oshift;
132       ja[work[i1]++] = i2 + oshift;
133       ++i2;
134       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135       i2 = tvc[col];
136     }
137     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
138 
139   }
140   ierr = PetscFree(work);CHKERRQ(ierr);
141   ierr = PetscFree(tns);CHKERRQ(ierr);
142   ierr = PetscFree(tvc);CHKERRQ(ierr);
143   PetscFunctionReturn(0);
144 }
145 
146 /*
147       This builds nonsymmetric version of nonzero structure,
148 */
149 #undef __FUNCT__
150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152 {
153   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
154   PetscErrorCode ierr;
155   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156   PetscInt       *tns,*tvc,nsz,i1,i2;
157   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
158 
159   PetscFunctionBegin;
160   nslim_row = a->inode.node_count;
161   n         = A->cmap->n;
162 
163   /* Create The column_inode for this matrix */
164   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
165 
166   /* allocate space for reformated column_inode structure */
167   ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr);
168   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
169   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
170 
171   for (i1=0,col=0; i1<nslim_col; ++i1) {
172     nsz = ns_col[i1];
173     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174   }
175   /* allocate space for row pointers */
176   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
177   *iia = ia;
178   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
179   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
180 
181   /* determine the number of columns in each row */
182   ia[0] = oshift;
183   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184     j   = aj + ai[row] + ishift;
185     col = *j++ + ishift;
186     i2  = tvc[col];
187     nz  = ai[row+1] - ai[row];
188     while (nz-- > 0) {           /* off-diagonal elemets */
189       ia[i1+1]++;
190       i2++;                     /* Start col of next node */
191       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192       if (nz > 0) i2 = tvc[col];
193     }
194   }
195 
196   /* shift ia[i] to point to next row */
197   for (i1=1; i1<nslim_row+1; i1++) {
198     row        = ia[i1-1];
199     ia[i1]    += row;
200     work[i1-1] = row - oshift;
201   }
202 
203   /* allocate space for column pointers */
204   nz   = ia[nslim_row] + (!ishift);
205   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
206   *jja = ja;
207 
208   /* loop over matrix putting into ja */
209   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210     j   = aj + ai[row] + ishift;
211     col = *j++ + ishift;
212     i2  = tvc[col];
213     nz  = ai[row+1] - ai[row];
214     while (nz-- > 0) {
215       ja[work[i1]++] = i2 + oshift;
216       ++i2;
217       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218       if (nz > 0) i2 = tvc[col];
219     }
220   }
221   ierr = PetscFree(ns_col);CHKERRQ(ierr);
222   ierr = PetscFree(work);CHKERRQ(ierr);
223   ierr = PetscFree(tns);CHKERRQ(ierr);
224   ierr = PetscFree(tvc);CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
231 {
232   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
233   PetscErrorCode ierr;
234 
235   PetscFunctionBegin;
236   *n = a->inode.node_count;
237   if (!ia) PetscFunctionReturn(0);
238   if (!blockcompressed) {
239     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
240   } else if (symmetric) {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   } else {
243     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #undef __FUNCT__
249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
251 {
252   PetscErrorCode ierr;
253 
254   PetscFunctionBegin;
255   if (!ia) PetscFunctionReturn(0);
256 
257   if (!blockcompressed) {
258     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
259   } else {
260     ierr = PetscFree(*ia);CHKERRQ(ierr);
261     ierr = PetscFree(*ja);CHKERRQ(ierr);
262   }
263   PetscFunctionReturn(0);
264 }
265 
266 /* ----------------------------------------------------------- */
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271 {
272   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
273   PetscErrorCode ierr;
274   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276 
277   PetscFunctionBegin;
278   nslim_row = a->inode.node_count;
279   n         = A->cmap->n;
280 
281   /* Create The column_inode for this matrix */
282   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
283 
284   /* allocate space for reformated column_inode structure */
285   ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr);
286   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
287   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
288 
289   for (i1=0,col=0; i1<nslim_col; ++i1) {
290     nsz = ns_col[i1];
291     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292   }
293   /* allocate space for column pointers */
294   ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
295   *iia = ia;
296   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
297   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
298 
299   /* determine the number of columns in each row */
300   ia[0] = oshift;
301   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302     j   = aj + ai[row] + ishift;
303     col = *j++ + ishift;
304     i2  = tvc[col];
305     nz  = ai[row+1] - ai[row];
306     while (nz-- > 0) {           /* off-diagonal elemets */
307       /* ia[i1+1]++; */
308       ia[i2+1]++;
309       i2++;
310       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311       if (nz > 0) i2 = tvc[col];
312     }
313   }
314 
315   /* shift ia[i] to point to next col */
316   for (i1=1; i1<nslim_col+1; i1++) {
317     col        = ia[i1-1];
318     ia[i1]    += col;
319     work[i1-1] = col - oshift;
320   }
321 
322   /* allocate space for column pointers */
323   nz   = ia[nslim_col] + (!ishift);
324   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
325   *jja = ja;
326 
327   /* loop over matrix putting into ja */
328   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329     j   = aj + ai[row] + ishift;
330     col = *j++ + ishift;
331     i2  = tvc[col];
332     nz  = ai[row+1] - ai[row];
333     while (nz-- > 0) {
334       /* ja[work[i1]++] = i2 + oshift; */
335       ja[work[i2]++] = i1 + oshift;
336       i2++;
337       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338       if (nz > 0) i2 = tvc[col];
339     }
340   }
341   ierr = PetscFree(ns_col);CHKERRQ(ierr);
342   ierr = PetscFree(work);CHKERRQ(ierr);
343   ierr = PetscFree(tns);CHKERRQ(ierr);
344   ierr = PetscFree(tvc);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351 {
352   PetscErrorCode ierr;
353 
354   PetscFunctionBegin;
355   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
356   if (!ia) PetscFunctionReturn(0);
357 
358   if (!blockcompressed) {
359     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
360   } else if (symmetric) {
361     /* Since the indices are symmetric it does'nt matter */
362     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363   } else {
364     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
365   }
366   PetscFunctionReturn(0);
367 }
368 
369 #undef __FUNCT__
370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
372 {
373   PetscErrorCode ierr;
374 
375   PetscFunctionBegin;
376   if (!ia) PetscFunctionReturn(0);
377   if (!blockcompressed) {
378     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
379   } else {
380     ierr = PetscFree(*ia);CHKERRQ(ierr);
381     ierr = PetscFree(*ja);CHKERRQ(ierr);
382   }
383   PetscFunctionReturn(0);
384 }
385 
386 /* ----------------------------------------------------------- */
387 
388 #undef __FUNCT__
389 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391 {
392   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
393   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394   PetscScalar       *y;
395   const PetscScalar *x;
396   const MatScalar   *v1,*v2,*v3,*v4,*v5;
397   PetscErrorCode    ierr;
398   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399   const PetscInt    *idx,*ns,*ii;
400 
401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403 #endif
404 
405   PetscFunctionBegin;
406   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407   node_max = a->inode.node_count;
408   ns       = a->inode.size;     /* Node Size array */
409   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
411   idx      = a->j;
412   v1       = a->a;
413   ii       = a->i;
414 
415   for (i = 0,row = 0; i< node_max; ++i) {
416     nsz         = ns[i];
417     n           = ii[1] - ii[0];
418     nonzerorow += (n>0)*nsz;
419     ii         += nsz;
420     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
421     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
422     sz = n;                     /* No of non zeros in this row */
423                                 /* Switch on the size of Node */
424     switch (nsz) {               /* Each loop in 'case' is unrolled */
425     case 1:
426       sum1 = 0.;
427 
428       for (n = 0; n< sz-1; n+=2) {
429         i1    = idx[0];         /* The instructions are ordered to */
430         i2    = idx[1];         /* make the compiler's job easy */
431         idx  += 2;
432         tmp0  = x[i1];
433         tmp1  = x[i2];
434         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435       }
436 
437       if (n == sz-1) {          /* Take care of the last nonzero  */
438         tmp0  = x[*idx++];
439         sum1 += *v1++ *tmp0;
440       }
441       y[row++]=sum1;
442       break;
443     case 2:
444       sum1 = 0.;
445       sum2 = 0.;
446       v2   = v1 + n;
447 
448       for (n = 0; n< sz-1; n+=2) {
449         i1    = idx[0];
450         i2    = idx[1];
451         idx  += 2;
452         tmp0  = x[i1];
453         tmp1  = x[i2];
454         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456       }
457       if (n == sz-1) {
458         tmp0  = x[*idx++];
459         sum1 += *v1++ * tmp0;
460         sum2 += *v2++ * tmp0;
461       }
462       y[row++]=sum1;
463       y[row++]=sum2;
464       v1      =v2;              /* Since the next block to be processed starts there*/
465       idx    +=sz;
466       break;
467     case 3:
468       sum1 = 0.;
469       sum2 = 0.;
470       sum3 = 0.;
471       v2   = v1 + n;
472       v3   = v2 + n;
473 
474       for (n = 0; n< sz-1; n+=2) {
475         i1    = idx[0];
476         i2    = idx[1];
477         idx  += 2;
478         tmp0  = x[i1];
479         tmp1  = x[i2];
480         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483       }
484       if (n == sz-1) {
485         tmp0  = x[*idx++];
486         sum1 += *v1++ * tmp0;
487         sum2 += *v2++ * tmp0;
488         sum3 += *v3++ * tmp0;
489       }
490       y[row++]=sum1;
491       y[row++]=sum2;
492       y[row++]=sum3;
493       v1      =v3;              /* Since the next block to be processed starts there*/
494       idx    +=2*sz;
495       break;
496     case 4:
497       sum1 = 0.;
498       sum2 = 0.;
499       sum3 = 0.;
500       sum4 = 0.;
501       v2   = v1 + n;
502       v3   = v2 + n;
503       v4   = v3 + n;
504 
505       for (n = 0; n< sz-1; n+=2) {
506         i1    = idx[0];
507         i2    = idx[1];
508         idx  += 2;
509         tmp0  = x[i1];
510         tmp1  = x[i2];
511         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515       }
516       if (n == sz-1) {
517         tmp0  = x[*idx++];
518         sum1 += *v1++ * tmp0;
519         sum2 += *v2++ * tmp0;
520         sum3 += *v3++ * tmp0;
521         sum4 += *v4++ * tmp0;
522       }
523       y[row++]=sum1;
524       y[row++]=sum2;
525       y[row++]=sum3;
526       y[row++]=sum4;
527       v1      =v4;              /* Since the next block to be processed starts there*/
528       idx    +=3*sz;
529       break;
530     case 5:
531       sum1 = 0.;
532       sum2 = 0.;
533       sum3 = 0.;
534       sum4 = 0.;
535       sum5 = 0.;
536       v2   = v1 + n;
537       v3   = v2 + n;
538       v4   = v3 + n;
539       v5   = v4 + n;
540 
541       for (n = 0; n<sz-1; n+=2) {
542         i1    = idx[0];
543         i2    = idx[1];
544         idx  += 2;
545         tmp0  = x[i1];
546         tmp1  = x[i2];
547         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552       }
553       if (n == sz-1) {
554         tmp0  = x[*idx++];
555         sum1 += *v1++ * tmp0;
556         sum2 += *v2++ * tmp0;
557         sum3 += *v3++ * tmp0;
558         sum4 += *v4++ * tmp0;
559         sum5 += *v5++ * tmp0;
560       }
561       y[row++]=sum1;
562       y[row++]=sum2;
563       y[row++]=sum3;
564       y[row++]=sum4;
565       y[row++]=sum5;
566       v1      =v5;       /* Since the next block to be processed starts there */
567       idx    +=4*sz;
568       break;
569     default:
570       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571     }
572   }
573   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
574   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
575   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
576   PetscFunctionReturn(0);
577 }
578 /* ----------------------------------------------------------- */
579 /* Almost same code as the MatMult_SeqAIJ_Inode() */
580 #undef __FUNCT__
581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583 {
584   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
585   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586   const MatScalar   *v1,*v2,*v3,*v4,*v5;
587   const PetscScalar *x;
588   PetscScalar       *y,*z,*zt;
589   PetscErrorCode    ierr;
590   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
591   const PetscInt    *idx,*ns,*ii;
592 
593   PetscFunctionBegin;
594   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595   node_max = a->inode.node_count;
596   ns       = a->inode.size;     /* Node Size array */
597 
598   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
599   ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
600   zt = z;
601 
602   idx = a->j;
603   v1  = a->a;
604   ii  = a->i;
605 
606   for (i = 0,row = 0; i< node_max; ++i) {
607     nsz = ns[i];
608     n   = ii[1] - ii[0];
609     ii += nsz;
610     sz  = n;                    /* No of non zeros in this row */
611                                 /* Switch on the size of Node */
612     switch (nsz) {               /* Each loop in 'case' is unrolled */
613     case 1:
614       sum1 = *zt++;
615 
616       for (n = 0; n< sz-1; n+=2) {
617         i1    = idx[0];         /* The instructions are ordered to */
618         i2    = idx[1];         /* make the compiler's job easy */
619         idx  += 2;
620         tmp0  = x[i1];
621         tmp1  = x[i2];
622         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
623       }
624 
625       if (n   == sz-1) {          /* Take care of the last nonzero  */
626         tmp0  = x[*idx++];
627         sum1 += *v1++ * tmp0;
628       }
629       y[row++]=sum1;
630       break;
631     case 2:
632       sum1 = *zt++;
633       sum2 = *zt++;
634       v2   = v1 + n;
635 
636       for (n = 0; n< sz-1; n+=2) {
637         i1    = idx[0];
638         i2    = idx[1];
639         idx  += 2;
640         tmp0  = x[i1];
641         tmp1  = x[i2];
642         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
643         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
644       }
645       if (n   == sz-1) {
646         tmp0  = x[*idx++];
647         sum1 += *v1++ * tmp0;
648         sum2 += *v2++ * tmp0;
649       }
650       y[row++]=sum1;
651       y[row++]=sum2;
652       v1      =v2;              /* Since the next block to be processed starts there*/
653       idx    +=sz;
654       break;
655     case 3:
656       sum1 = *zt++;
657       sum2 = *zt++;
658       sum3 = *zt++;
659       v2   = v1 + n;
660       v3   = v2 + n;
661 
662       for (n = 0; n< sz-1; n+=2) {
663         i1    = idx[0];
664         i2    = idx[1];
665         idx  += 2;
666         tmp0  = x[i1];
667         tmp1  = x[i2];
668         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
669         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
670         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
671       }
672       if (n == sz-1) {
673         tmp0  = x[*idx++];
674         sum1 += *v1++ * tmp0;
675         sum2 += *v2++ * tmp0;
676         sum3 += *v3++ * tmp0;
677       }
678       y[row++]=sum1;
679       y[row++]=sum2;
680       y[row++]=sum3;
681       v1      =v3;              /* Since the next block to be processed starts there*/
682       idx    +=2*sz;
683       break;
684     case 4:
685       sum1 = *zt++;
686       sum2 = *zt++;
687       sum3 = *zt++;
688       sum4 = *zt++;
689       v2   = v1 + n;
690       v3   = v2 + n;
691       v4   = v3 + n;
692 
693       for (n = 0; n< sz-1; n+=2) {
694         i1    = idx[0];
695         i2    = idx[1];
696         idx  += 2;
697         tmp0  = x[i1];
698         tmp1  = x[i2];
699         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
700         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
701         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
702         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
703       }
704       if (n == sz-1) {
705         tmp0  = x[*idx++];
706         sum1 += *v1++ * tmp0;
707         sum2 += *v2++ * tmp0;
708         sum3 += *v3++ * tmp0;
709         sum4 += *v4++ * tmp0;
710       }
711       y[row++]=sum1;
712       y[row++]=sum2;
713       y[row++]=sum3;
714       y[row++]=sum4;
715       v1      =v4;              /* Since the next block to be processed starts there*/
716       idx    +=3*sz;
717       break;
718     case 5:
719       sum1 = *zt++;
720       sum2 = *zt++;
721       sum3 = *zt++;
722       sum4 = *zt++;
723       sum5 = *zt++;
724       v2   = v1 + n;
725       v3   = v2 + n;
726       v4   = v3 + n;
727       v5   = v4 + n;
728 
729       for (n = 0; n<sz-1; n+=2) {
730         i1    = idx[0];
731         i2    = idx[1];
732         idx  += 2;
733         tmp0  = x[i1];
734         tmp1  = x[i2];
735         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
736         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
737         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
738         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
739         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
740       }
741       if (n   == sz-1) {
742         tmp0  = x[*idx++];
743         sum1 += *v1++ * tmp0;
744         sum2 += *v2++ * tmp0;
745         sum3 += *v3++ * tmp0;
746         sum4 += *v4++ * tmp0;
747         sum5 += *v5++ * tmp0;
748       }
749       y[row++]=sum1;
750       y[row++]=sum2;
751       y[row++]=sum3;
752       y[row++]=sum4;
753       y[row++]=sum5;
754       v1      =v5;       /* Since the next block to be processed starts there */
755       idx    +=4*sz;
756       break;
757     default:
758       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
759     }
760   }
761   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
762   ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
763   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
764   PetscFunctionReturn(0);
765 }
766 
767 /* ----------------------------------------------------------- */
768 #undef __FUNCT__
769 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
770 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
771 {
772   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
773   IS                iscol = a->col,isrow = a->row;
774   PetscErrorCode    ierr;
775   const PetscInt    *r,*c,*rout,*cout;
776   PetscInt          i,j,n = A->rmap->n,nz;
777   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
778   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
779   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
780   PetscScalar       sum1,sum2,sum3,sum4,sum5;
781   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
782   const PetscScalar *b;
783 
784   PetscFunctionBegin;
785   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
786   node_max = a->inode.node_count;
787   ns       = a->inode.size;     /* Node Size array */
788 
789   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
790   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
791   tmp  = a->solve_work;
792 
793   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
794   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
795 
796   /* forward solve the lower triangular */
797   tmps = tmp;
798   aa   = a_a;
799   aj   = a_j;
800   ad   = a->diag;
801 
802   for (i = 0,row = 0; i< node_max; ++i) {
803     nsz = ns[i];
804     aii = ai[row];
805     v1  = aa + aii;
806     vi  = aj + aii;
807     nz  = ad[row]- aii;
808     if (i < node_max-1) {
809       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
810       * but our indexing to determine it's size could. */
811       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
812       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
813       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
814       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
815     }
816 
817     switch (nsz) {               /* Each loop in 'case' is unrolled */
818     case 1:
819       sum1 = b[*r++];
820       for (j=0; j<nz-1; j+=2) {
821         i0    = vi[0];
822         i1    = vi[1];
823         vi   +=2;
824         tmp0  = tmps[i0];
825         tmp1  = tmps[i1];
826         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
827       }
828       if (j == nz-1) {
829         tmp0  = tmps[*vi++];
830         sum1 -= *v1++ *tmp0;
831       }
832       tmp[row++]=sum1;
833       break;
834     case 2:
835       sum1 = b[*r++];
836       sum2 = b[*r++];
837       v2   = aa + ai[row+1];
838 
839       for (j=0; j<nz-1; j+=2) {
840         i0    = vi[0];
841         i1    = vi[1];
842         vi   +=2;
843         tmp0  = tmps[i0];
844         tmp1  = tmps[i1];
845         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
846         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
847       }
848       if (j == nz-1) {
849         tmp0  = tmps[*vi++];
850         sum1 -= *v1++ *tmp0;
851         sum2 -= *v2++ *tmp0;
852       }
853       sum2     -= *v2++ *sum1;
854       tmp[row++]=sum1;
855       tmp[row++]=sum2;
856       break;
857     case 3:
858       sum1 = b[*r++];
859       sum2 = b[*r++];
860       sum3 = b[*r++];
861       v2   = aa + ai[row+1];
862       v3   = aa + ai[row+2];
863 
864       for (j=0; j<nz-1; j+=2) {
865         i0    = vi[0];
866         i1    = vi[1];
867         vi   +=2;
868         tmp0  = tmps[i0];
869         tmp1  = tmps[i1];
870         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
871         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
872         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
873       }
874       if (j == nz-1) {
875         tmp0  = tmps[*vi++];
876         sum1 -= *v1++ *tmp0;
877         sum2 -= *v2++ *tmp0;
878         sum3 -= *v3++ *tmp0;
879       }
880       sum2 -= *v2++ * sum1;
881       sum3 -= *v3++ * sum1;
882       sum3 -= *v3++ * sum2;
883 
884       tmp[row++]=sum1;
885       tmp[row++]=sum2;
886       tmp[row++]=sum3;
887       break;
888 
889     case 4:
890       sum1 = b[*r++];
891       sum2 = b[*r++];
892       sum3 = b[*r++];
893       sum4 = b[*r++];
894       v2   = aa + ai[row+1];
895       v3   = aa + ai[row+2];
896       v4   = aa + ai[row+3];
897 
898       for (j=0; j<nz-1; j+=2) {
899         i0    = vi[0];
900         i1    = vi[1];
901         vi   +=2;
902         tmp0  = tmps[i0];
903         tmp1  = tmps[i1];
904         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
905         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
906         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
907         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
908       }
909       if (j == nz-1) {
910         tmp0  = tmps[*vi++];
911         sum1 -= *v1++ *tmp0;
912         sum2 -= *v2++ *tmp0;
913         sum3 -= *v3++ *tmp0;
914         sum4 -= *v4++ *tmp0;
915       }
916       sum2 -= *v2++ * sum1;
917       sum3 -= *v3++ * sum1;
918       sum4 -= *v4++ * sum1;
919       sum3 -= *v3++ * sum2;
920       sum4 -= *v4++ * sum2;
921       sum4 -= *v4++ * sum3;
922 
923       tmp[row++]=sum1;
924       tmp[row++]=sum2;
925       tmp[row++]=sum3;
926       tmp[row++]=sum4;
927       break;
928     case 5:
929       sum1 = b[*r++];
930       sum2 = b[*r++];
931       sum3 = b[*r++];
932       sum4 = b[*r++];
933       sum5 = b[*r++];
934       v2   = aa + ai[row+1];
935       v3   = aa + ai[row+2];
936       v4   = aa + ai[row+3];
937       v5   = aa + ai[row+4];
938 
939       for (j=0; j<nz-1; j+=2) {
940         i0    = vi[0];
941         i1    = vi[1];
942         vi   +=2;
943         tmp0  = tmps[i0];
944         tmp1  = tmps[i1];
945         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
949         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
950       }
951       if (j == nz-1) {
952         tmp0  = tmps[*vi++];
953         sum1 -= *v1++ *tmp0;
954         sum2 -= *v2++ *tmp0;
955         sum3 -= *v3++ *tmp0;
956         sum4 -= *v4++ *tmp0;
957         sum5 -= *v5++ *tmp0;
958       }
959 
960       sum2 -= *v2++ * sum1;
961       sum3 -= *v3++ * sum1;
962       sum4 -= *v4++ * sum1;
963       sum5 -= *v5++ * sum1;
964       sum3 -= *v3++ * sum2;
965       sum4 -= *v4++ * sum2;
966       sum5 -= *v5++ * sum2;
967       sum4 -= *v4++ * sum3;
968       sum5 -= *v5++ * sum3;
969       sum5 -= *v5++ * sum4;
970 
971       tmp[row++]=sum1;
972       tmp[row++]=sum2;
973       tmp[row++]=sum3;
974       tmp[row++]=sum4;
975       tmp[row++]=sum5;
976       break;
977     default:
978       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
979     }
980   }
981   /* backward solve the upper triangular */
982   for (i=node_max -1,row = n-1; i>=0; i--) {
983     nsz = ns[i];
984     aii = ai[row+1] -1;
985     v1  = aa + aii;
986     vi  = aj + aii;
987     nz  = aii- ad[row];
988     switch (nsz) {               /* Each loop in 'case' is unrolled */
989     case 1:
990       sum1 = tmp[row];
991 
992       for (j=nz; j>1; j-=2) {
993         vi   -=2;
994         i0    = vi[2];
995         i1    = vi[1];
996         tmp0  = tmps[i0];
997         tmp1  = tmps[i1];
998         v1   -= 2;
999         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000       }
1001       if (j==1) {
1002         tmp0  = tmps[*vi--];
1003         sum1 -= *v1-- * tmp0;
1004       }
1005       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006       break;
1007     case 2:
1008       sum1 = tmp[row];
1009       sum2 = tmp[row -1];
1010       v2   = aa + ai[row]-1;
1011       for (j=nz; j>1; j-=2) {
1012         vi   -=2;
1013         i0    = vi[2];
1014         i1    = vi[1];
1015         tmp0  = tmps[i0];
1016         tmp1  = tmps[i1];
1017         v1   -= 2;
1018         v2   -= 2;
1019         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1020         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1021       }
1022       if (j==1) {
1023         tmp0  = tmps[*vi--];
1024         sum1 -= *v1-- * tmp0;
1025         sum2 -= *v2-- * tmp0;
1026       }
1027 
1028       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1029       sum2   -= *v2-- * tmp0;
1030       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1031       break;
1032     case 3:
1033       sum1 = tmp[row];
1034       sum2 = tmp[row -1];
1035       sum3 = tmp[row -2];
1036       v2   = aa + ai[row]-1;
1037       v3   = aa + ai[row -1]-1;
1038       for (j=nz; j>1; j-=2) {
1039         vi   -=2;
1040         i0    = vi[2];
1041         i1    = vi[1];
1042         tmp0  = tmps[i0];
1043         tmp1  = tmps[i1];
1044         v1   -= 2;
1045         v2   -= 2;
1046         v3   -= 2;
1047         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1048         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1049         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1050       }
1051       if (j==1) {
1052         tmp0  = tmps[*vi--];
1053         sum1 -= *v1-- * tmp0;
1054         sum2 -= *v2-- * tmp0;
1055         sum3 -= *v3-- * tmp0;
1056       }
1057       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1058       sum2   -= *v2-- * tmp0;
1059       sum3   -= *v3-- * tmp0;
1060       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1061       sum3   -= *v3-- * tmp0;
1062       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1063 
1064       break;
1065     case 4:
1066       sum1 = tmp[row];
1067       sum2 = tmp[row -1];
1068       sum3 = tmp[row -2];
1069       sum4 = tmp[row -3];
1070       v2   = aa + ai[row]-1;
1071       v3   = aa + ai[row -1]-1;
1072       v4   = aa + ai[row -2]-1;
1073 
1074       for (j=nz; j>1; j-=2) {
1075         vi   -=2;
1076         i0    = vi[2];
1077         i1    = vi[1];
1078         tmp0  = tmps[i0];
1079         tmp1  = tmps[i1];
1080         v1   -= 2;
1081         v2   -= 2;
1082         v3   -= 2;
1083         v4   -= 2;
1084         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1085         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1086         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1087         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1088       }
1089       if (j==1) {
1090         tmp0  = tmps[*vi--];
1091         sum1 -= *v1-- * tmp0;
1092         sum2 -= *v2-- * tmp0;
1093         sum3 -= *v3-- * tmp0;
1094         sum4 -= *v4-- * tmp0;
1095       }
1096 
1097       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1098       sum2   -= *v2-- * tmp0;
1099       sum3   -= *v3-- * tmp0;
1100       sum4   -= *v4-- * tmp0;
1101       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102       sum3   -= *v3-- * tmp0;
1103       sum4   -= *v4-- * tmp0;
1104       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105       sum4   -= *v4-- * tmp0;
1106       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1107       break;
1108     case 5:
1109       sum1 = tmp[row];
1110       sum2 = tmp[row -1];
1111       sum3 = tmp[row -2];
1112       sum4 = tmp[row -3];
1113       sum5 = tmp[row -4];
1114       v2   = aa + ai[row]-1;
1115       v3   = aa + ai[row -1]-1;
1116       v4   = aa + ai[row -2]-1;
1117       v5   = aa + ai[row -3]-1;
1118       for (j=nz; j>1; j-=2) {
1119         vi   -= 2;
1120         i0    = vi[2];
1121         i1    = vi[1];
1122         tmp0  = tmps[i0];
1123         tmp1  = tmps[i1];
1124         v1   -= 2;
1125         v2   -= 2;
1126         v3   -= 2;
1127         v4   -= 2;
1128         v5   -= 2;
1129         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1130         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1131         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1132         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1133         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1134       }
1135       if (j==1) {
1136         tmp0  = tmps[*vi--];
1137         sum1 -= *v1-- * tmp0;
1138         sum2 -= *v2-- * tmp0;
1139         sum3 -= *v3-- * tmp0;
1140         sum4 -= *v4-- * tmp0;
1141         sum5 -= *v5-- * tmp0;
1142       }
1143 
1144       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1145       sum2   -= *v2-- * tmp0;
1146       sum3   -= *v3-- * tmp0;
1147       sum4   -= *v4-- * tmp0;
1148       sum5   -= *v5-- * tmp0;
1149       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1150       sum3   -= *v3-- * tmp0;
1151       sum4   -= *v4-- * tmp0;
1152       sum5   -= *v5-- * tmp0;
1153       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1154       sum4   -= *v4-- * tmp0;
1155       sum5   -= *v5-- * tmp0;
1156       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1157       sum5   -= *v5-- * tmp0;
1158       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1159       break;
1160     default:
1161       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1162     }
1163   }
1164   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1165   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1166   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1167   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1168   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1169   PetscFunctionReturn(0);
1170 }
1171 
1172 #undef __FUNCT__
1173 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1174 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1175 {
1176   Mat             C     =B;
1177   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1178   IS              isrow = b->row,isicol = b->icol;
1179   PetscErrorCode  ierr;
1180   const PetscInt  *r,*ic,*ics;
1181   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1182   PetscInt        i,j,k,nz,nzL,row,*pj;
1183   const PetscInt  *ajtmp,*bjtmp;
1184   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1185   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1186   FactorShiftCtx  sctx;
1187   const PetscInt  *ddiag;
1188   PetscReal       rs;
1189   MatScalar       d;
1190   PetscInt        inod,nodesz,node_max,col;
1191   const PetscInt  *ns;
1192   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1193 
1194   PetscFunctionBegin;
1195   /* MatPivotSetUp(): initialize shift context sctx */
1196   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1197 
1198   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1199     ddiag          = a->diag;
1200     sctx.shift_top = info->zeropivot;
1201     for (i=0; i<n; i++) {
1202       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1203       d  = (aa)[ddiag[i]];
1204       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1205       v  = aa+ai[i];
1206       nz = ai[i+1] - ai[i];
1207       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1208       if (rs>sctx.shift_top) sctx.shift_top = rs;
1209     }
1210     sctx.shift_top *= 1.1;
1211     sctx.nshift_max = 5;
1212     sctx.shift_lo   = 0.;
1213     sctx.shift_hi   = 1.;
1214   }
1215 
1216   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1217   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1218 
1219   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1220   ics   = ic;
1221 
1222   node_max = a->inode.node_count;
1223   ns       = a->inode.size;
1224   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1225 
1226   /* If max inode size > 4, split it into two inodes.*/
1227   /* also map the inode sizes according to the ordering */
1228   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1229   for (i=0,j=0; i<node_max; ++i,++j) {
1230     if (ns[i] > 4) {
1231       tmp_vec1[j] = 4;
1232       ++j;
1233       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1234     } else {
1235       tmp_vec1[j] = ns[i];
1236     }
1237   }
1238   /* Use the correct node_max */
1239   node_max = j;
1240 
1241   /* Now reorder the inode info based on mat re-ordering info */
1242   /* First create a row -> inode_size_array_index map */
1243   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1244   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1245   for (i=0,row=0; i<node_max; i++) {
1246     nodesz = tmp_vec1[i];
1247     for (j=0; j<nodesz; j++,row++) {
1248       nsmap[row] = i;
1249     }
1250   }
1251   /* Using nsmap, create a reordered ns structure */
1252   for (i=0,j=0; i< node_max; i++) {
1253     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1254     tmp_vec2[i] = nodesz;
1255     j          += nodesz;
1256   }
1257   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1258   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1259 
1260   /* Now use the correct ns */
1261   ns = tmp_vec2;
1262 
1263   do {
1264     sctx.newshift = PETSC_FALSE;
1265     /* Now loop over each block-row, and do the factorization */
1266     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1267       nodesz = ns[inod];
1268 
1269       switch (nodesz) {
1270       case 1:
1271         /*----------*/
1272         /* zero rtmp1 */
1273         /* L part */
1274         nz    = bi[i+1] - bi[i];
1275         bjtmp = bj + bi[i];
1276         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1277 
1278         /* U part */
1279         nz    = bdiag[i]-bdiag[i+1];
1280         bjtmp = bj + bdiag[i+1]+1;
1281         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1282 
1283         /* load in initial (unfactored row) */
1284         nz    = ai[r[i]+1] - ai[r[i]];
1285         ajtmp = aj + ai[r[i]];
1286         v     = aa + ai[r[i]];
1287         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1288 
1289         /* ZeropivotApply() */
1290         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1291 
1292         /* elimination */
1293         bjtmp = bj + bi[i];
1294         row   = *bjtmp++;
1295         nzL   = bi[i+1] - bi[i];
1296         for (k=0; k < nzL; k++) {
1297           pc = rtmp1 + row;
1298           if (*pc != 0.0) {
1299             pv   = b->a + bdiag[row];
1300             mul1 = *pc * (*pv);
1301             *pc  = mul1;
1302             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1303             pv   = b->a + bdiag[row+1]+1;
1304             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1305             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1306             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1307           }
1308           row = *bjtmp++;
1309         }
1310 
1311         /* finished row so stick it into b->a */
1312         rs = 0.0;
1313         /* L part */
1314         pv = b->a + bi[i];
1315         pj = b->j + bi[i];
1316         nz = bi[i+1] - bi[i];
1317         for (j=0; j<nz; j++) {
1318           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1319         }
1320 
1321         /* U part */
1322         pv = b->a + bdiag[i+1]+1;
1323         pj = b->j + bdiag[i+1]+1;
1324         nz = bdiag[i] - bdiag[i+1]-1;
1325         for (j=0; j<nz; j++) {
1326           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1327         }
1328 
1329         /* Check zero pivot */
1330         sctx.rs = rs;
1331         sctx.pv = rtmp1[i];
1332         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1333         if (sctx.newshift) break;
1334 
1335         /* Mark diagonal and invert diagonal for simplier triangular solves */
1336         pv  = b->a + bdiag[i];
1337         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1338         break;
1339 
1340       case 2:
1341         /*----------*/
1342         /* zero rtmp1 and rtmp2 */
1343         /* L part */
1344         nz    = bi[i+1] - bi[i];
1345         bjtmp = bj + bi[i];
1346         for  (j=0; j<nz; j++) {
1347           col        = bjtmp[j];
1348           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1349         }
1350 
1351         /* U part */
1352         nz    = bdiag[i]-bdiag[i+1];
1353         bjtmp = bj + bdiag[i+1]+1;
1354         for  (j=0; j<nz; j++) {
1355           col        = bjtmp[j];
1356           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1357         }
1358 
1359         /* load in initial (unfactored row) */
1360         nz    = ai[r[i]+1] - ai[r[i]];
1361         ajtmp = aj + ai[r[i]];
1362         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1363         for (j=0; j<nz; j++) {
1364           col        = ics[ajtmp[j]];
1365           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1366         }
1367         /* ZeropivotApply(): shift the diagonal of the matrix  */
1368         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1369 
1370         /* elimination */
1371         bjtmp = bj + bi[i];
1372         row   = *bjtmp++; /* pivot row */
1373         nzL   = bi[i+1] - bi[i];
1374         for (k=0; k < nzL; k++) {
1375           pc1 = rtmp1 + row;
1376           pc2 = rtmp2 + row;
1377           if (*pc1 != 0.0 || *pc2 != 0.0) {
1378             pv   = b->a + bdiag[row];
1379             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1380             *pc1 = mul1;       *pc2 = mul2;
1381 
1382             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1383             pv = b->a + bdiag[row+1]+1;
1384             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1385             for (j=0; j<nz; j++) {
1386               col         = pj[j];
1387               rtmp1[col] -= mul1 * pv[j];
1388               rtmp2[col] -= mul2 * pv[j];
1389             }
1390             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1391           }
1392           row = *bjtmp++;
1393         }
1394 
1395         /* finished row i; check zero pivot, then stick row i into b->a */
1396         rs = 0.0;
1397         /* L part */
1398         pc1 = b->a + bi[i];
1399         pj  = b->j + bi[i];
1400         nz  = bi[i+1] - bi[i];
1401         for (j=0; j<nz; j++) {
1402           col    = pj[j];
1403           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1404         }
1405         /* U part */
1406         pc1 = b->a + bdiag[i+1]+1;
1407         pj  = b->j + bdiag[i+1]+1;
1408         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1409         for (j=0; j<nz; j++) {
1410           col    = pj[j];
1411           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1412         }
1413 
1414         sctx.rs = rs;
1415         sctx.pv = rtmp1[i];
1416         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1417         if (sctx.newshift) break;
1418         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1419         *pc1 = 1.0/sctx.pv;
1420 
1421         /* Now take care of diagonal 2x2 block. */
1422         pc2 = rtmp2 + i;
1423         if (*pc2 != 0.0) {
1424           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1425           *pc2 = mul1;          /* insert L entry */
1426           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1427           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1428           for (j=0; j<nz; j++) {
1429             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1430           }
1431           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1432         }
1433 
1434         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1435         rs = 0.0;
1436         /* L part */
1437         pc2 = b->a + bi[i+1];
1438         pj  = b->j + bi[i+1];
1439         nz  = bi[i+2] - bi[i+1];
1440         for (j=0; j<nz; j++) {
1441           col    = pj[j];
1442           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1443         }
1444         /* U part */
1445         pc2 = b->a + bdiag[i+2]+1;
1446         pj  = b->j + bdiag[i+2]+1;
1447         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1448         for (j=0; j<nz; j++) {
1449           col    = pj[j];
1450           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1451         }
1452 
1453         sctx.rs = rs;
1454         sctx.pv = rtmp2[i+1];
1455         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1456         if (sctx.newshift) break;
1457         pc2  = b->a + bdiag[i+1];
1458         *pc2 = 1.0/sctx.pv;
1459         break;
1460 
1461       case 3:
1462         /*----------*/
1463         /* zero rtmp */
1464         /* L part */
1465         nz    = bi[i+1] - bi[i];
1466         bjtmp = bj + bi[i];
1467         for  (j=0; j<nz; j++) {
1468           col        = bjtmp[j];
1469           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1470         }
1471 
1472         /* U part */
1473         nz    = bdiag[i]-bdiag[i+1];
1474         bjtmp = bj + bdiag[i+1]+1;
1475         for  (j=0; j<nz; j++) {
1476           col        = bjtmp[j];
1477           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1478         }
1479 
1480         /* load in initial (unfactored row) */
1481         nz    = ai[r[i]+1] - ai[r[i]];
1482         ajtmp = aj + ai[r[i]];
1483         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1484         for (j=0; j<nz; j++) {
1485           col        = ics[ajtmp[j]];
1486           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1487         }
1488         /* ZeropivotApply(): shift the diagonal of the matrix  */
1489         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1490 
1491         /* elimination */
1492         bjtmp = bj + bi[i];
1493         row   = *bjtmp++; /* pivot row */
1494         nzL   = bi[i+1] - bi[i];
1495         for (k=0; k < nzL; k++) {
1496           pc1 = rtmp1 + row;
1497           pc2 = rtmp2 + row;
1498           pc3 = rtmp3 + row;
1499           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1500             pv   = b->a + bdiag[row];
1501             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1502             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1503 
1504             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1505             pv = b->a + bdiag[row+1]+1;
1506             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1507             for (j=0; j<nz; j++) {
1508               col         = pj[j];
1509               rtmp1[col] -= mul1 * pv[j];
1510               rtmp2[col] -= mul2 * pv[j];
1511               rtmp3[col] -= mul3 * pv[j];
1512             }
1513             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1514           }
1515           row = *bjtmp++;
1516         }
1517 
1518         /* finished row i; check zero pivot, then stick row i into b->a */
1519         rs = 0.0;
1520         /* L part */
1521         pc1 = b->a + bi[i];
1522         pj  = b->j + bi[i];
1523         nz  = bi[i+1] - bi[i];
1524         for (j=0; j<nz; j++) {
1525           col    = pj[j];
1526           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1527         }
1528         /* U part */
1529         pc1 = b->a + bdiag[i+1]+1;
1530         pj  = b->j + bdiag[i+1]+1;
1531         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1532         for (j=0; j<nz; j++) {
1533           col    = pj[j];
1534           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1535         }
1536 
1537         sctx.rs = rs;
1538         sctx.pv = rtmp1[i];
1539         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1540         if (sctx.newshift) break;
1541         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1542         *pc1 = 1.0/sctx.pv;
1543 
1544         /* Now take care of 1st column of diagonal 3x3 block. */
1545         pc2 = rtmp2 + i;
1546         pc3 = rtmp3 + i;
1547         if (*pc2 != 0.0 || *pc3 != 0.0) {
1548           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1549           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1550           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1551           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1552           for (j=0; j<nz; j++) {
1553             col         = pj[j];
1554             rtmp2[col] -= mul2 * rtmp1[col];
1555             rtmp3[col] -= mul3 * rtmp1[col];
1556           }
1557           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1558         }
1559 
1560         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1561         rs = 0.0;
1562         /* L part */
1563         pc2 = b->a + bi[i+1];
1564         pj  = b->j + bi[i+1];
1565         nz  = bi[i+2] - bi[i+1];
1566         for (j=0; j<nz; j++) {
1567           col    = pj[j];
1568           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1569         }
1570         /* U part */
1571         pc2 = b->a + bdiag[i+2]+1;
1572         pj  = b->j + bdiag[i+2]+1;
1573         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1574         for (j=0; j<nz; j++) {
1575           col    = pj[j];
1576           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1577         }
1578 
1579         sctx.rs = rs;
1580         sctx.pv = rtmp2[i+1];
1581         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1582         if (sctx.newshift) break;
1583         pc2  = b->a + bdiag[i+1];
1584         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1585 
1586         /* Now take care of 2nd column of diagonal 3x3 block. */
1587         pc3 = rtmp3 + i+1;
1588         if (*pc3 != 0.0) {
1589           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1590           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1591           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1592           for (j=0; j<nz; j++) {
1593             col         = pj[j];
1594             rtmp3[col] -= mul3 * rtmp2[col];
1595           }
1596           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1597         }
1598 
1599         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1600         rs = 0.0;
1601         /* L part */
1602         pc3 = b->a + bi[i+2];
1603         pj  = b->j + bi[i+2];
1604         nz  = bi[i+3] - bi[i+2];
1605         for (j=0; j<nz; j++) {
1606           col    = pj[j];
1607           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1608         }
1609         /* U part */
1610         pc3 = b->a + bdiag[i+3]+1;
1611         pj  = b->j + bdiag[i+3]+1;
1612         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1613         for (j=0; j<nz; j++) {
1614           col    = pj[j];
1615           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1616         }
1617 
1618         sctx.rs = rs;
1619         sctx.pv = rtmp3[i+2];
1620         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1621         if (sctx.newshift) break;
1622         pc3  = b->a + bdiag[i+2];
1623         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1624         break;
1625       case 4:
1626         /*----------*/
1627         /* zero rtmp */
1628         /* L part */
1629         nz    = bi[i+1] - bi[i];
1630         bjtmp = bj + bi[i];
1631         for  (j=0; j<nz; j++) {
1632           col        = bjtmp[j];
1633           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1634         }
1635 
1636         /* U part */
1637         nz    = bdiag[i]-bdiag[i+1];
1638         bjtmp = bj + bdiag[i+1]+1;
1639         for  (j=0; j<nz; j++) {
1640           col        = bjtmp[j];
1641           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1642         }
1643 
1644         /* load in initial (unfactored row) */
1645         nz    = ai[r[i]+1] - ai[r[i]];
1646         ajtmp = aj + ai[r[i]];
1647         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1648         for (j=0; j<nz; j++) {
1649           col        = ics[ajtmp[j]];
1650           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1651         }
1652         /* ZeropivotApply(): shift the diagonal of the matrix  */
1653         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1654 
1655         /* elimination */
1656         bjtmp = bj + bi[i];
1657         row   = *bjtmp++; /* pivot row */
1658         nzL   = bi[i+1] - bi[i];
1659         for (k=0; k < nzL; k++) {
1660           pc1 = rtmp1 + row;
1661           pc2 = rtmp2 + row;
1662           pc3 = rtmp3 + row;
1663           pc4 = rtmp4 + row;
1664           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1665             pv   = b->a + bdiag[row];
1666             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1667             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1668 
1669             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1670             pv = b->a + bdiag[row+1]+1;
1671             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1672             for (j=0; j<nz; j++) {
1673               col         = pj[j];
1674               rtmp1[col] -= mul1 * pv[j];
1675               rtmp2[col] -= mul2 * pv[j];
1676               rtmp3[col] -= mul3 * pv[j];
1677               rtmp4[col] -= mul4 * pv[j];
1678             }
1679             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1680           }
1681           row = *bjtmp++;
1682         }
1683 
1684         /* finished row i; check zero pivot, then stick row i into b->a */
1685         rs = 0.0;
1686         /* L part */
1687         pc1 = b->a + bi[i];
1688         pj  = b->j + bi[i];
1689         nz  = bi[i+1] - bi[i];
1690         for (j=0; j<nz; j++) {
1691           col    = pj[j];
1692           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1693         }
1694         /* U part */
1695         pc1 = b->a + bdiag[i+1]+1;
1696         pj  = b->j + bdiag[i+1]+1;
1697         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1698         for (j=0; j<nz; j++) {
1699           col    = pj[j];
1700           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1701         }
1702 
1703         sctx.rs = rs;
1704         sctx.pv = rtmp1[i];
1705         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1706         if (sctx.newshift) break;
1707         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1708         *pc1 = 1.0/sctx.pv;
1709 
1710         /* Now take care of 1st column of diagonal 4x4 block. */
1711         pc2 = rtmp2 + i;
1712         pc3 = rtmp3 + i;
1713         pc4 = rtmp4 + i;
1714         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1715           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1716           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1717           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1718           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1719           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1720           for (j=0; j<nz; j++) {
1721             col         = pj[j];
1722             rtmp2[col] -= mul2 * rtmp1[col];
1723             rtmp3[col] -= mul3 * rtmp1[col];
1724             rtmp4[col] -= mul4 * rtmp1[col];
1725           }
1726           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1727         }
1728 
1729         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1730         rs = 0.0;
1731         /* L part */
1732         pc2 = b->a + bi[i+1];
1733         pj  = b->j + bi[i+1];
1734         nz  = bi[i+2] - bi[i+1];
1735         for (j=0; j<nz; j++) {
1736           col    = pj[j];
1737           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1738         }
1739         /* U part */
1740         pc2 = b->a + bdiag[i+2]+1;
1741         pj  = b->j + bdiag[i+2]+1;
1742         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1743         for (j=0; j<nz; j++) {
1744           col    = pj[j];
1745           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1746         }
1747 
1748         sctx.rs = rs;
1749         sctx.pv = rtmp2[i+1];
1750         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1751         if (sctx.newshift) break;
1752         pc2  = b->a + bdiag[i+1];
1753         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1754 
1755         /* Now take care of 2nd column of diagonal 4x4 block. */
1756         pc3 = rtmp3 + i+1;
1757         pc4 = rtmp4 + i+1;
1758         if (*pc3 != 0.0 || *pc4 != 0.0) {
1759           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1760           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1761           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1762           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1763           for (j=0; j<nz; j++) {
1764             col         = pj[j];
1765             rtmp3[col] -= mul3 * rtmp2[col];
1766             rtmp4[col] -= mul4 * rtmp2[col];
1767           }
1768           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1769         }
1770 
1771         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1772         rs = 0.0;
1773         /* L part */
1774         pc3 = b->a + bi[i+2];
1775         pj  = b->j + bi[i+2];
1776         nz  = bi[i+3] - bi[i+2];
1777         for (j=0; j<nz; j++) {
1778           col    = pj[j];
1779           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1780         }
1781         /* U part */
1782         pc3 = b->a + bdiag[i+3]+1;
1783         pj  = b->j + bdiag[i+3]+1;
1784         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1785         for (j=0; j<nz; j++) {
1786           col    = pj[j];
1787           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1788         }
1789 
1790         sctx.rs = rs;
1791         sctx.pv = rtmp3[i+2];
1792         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1793         if (sctx.newshift) break;
1794         pc3  = b->a + bdiag[i+2];
1795         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1796 
1797         /* Now take care of 3rd column of diagonal 4x4 block. */
1798         pc4 = rtmp4 + i+2;
1799         if (*pc4 != 0.0) {
1800           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1801           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1802           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1803           for (j=0; j<nz; j++) {
1804             col         = pj[j];
1805             rtmp4[col] -= mul4 * rtmp3[col];
1806           }
1807           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1808         }
1809 
1810         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1811         rs = 0.0;
1812         /* L part */
1813         pc4 = b->a + bi[i+3];
1814         pj  = b->j + bi[i+3];
1815         nz  = bi[i+4] - bi[i+3];
1816         for (j=0; j<nz; j++) {
1817           col    = pj[j];
1818           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1819         }
1820         /* U part */
1821         pc4 = b->a + bdiag[i+4]+1;
1822         pj  = b->j + bdiag[i+4]+1;
1823         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1824         for (j=0; j<nz; j++) {
1825           col    = pj[j];
1826           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1827         }
1828 
1829         sctx.rs = rs;
1830         sctx.pv = rtmp4[i+3];
1831         ierr    = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr);
1832         if (sctx.newshift) break;
1833         pc4  = b->a + bdiag[i+3];
1834         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1835         break;
1836 
1837       default:
1838         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1839       }
1840       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1841       i += nodesz;                 /* Update the row */
1842     }
1843 
1844     /* MatPivotRefine() */
1845     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1846       /*
1847        * if no shift in this attempt & shifting & started shifting & can refine,
1848        * then try lower shift
1849        */
1850       sctx.shift_hi       = sctx.shift_fraction;
1851       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1852       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1853       sctx.newshift       = PETSC_TRUE;
1854       sctx.nshift++;
1855     }
1856   } while (sctx.newshift);
1857 
1858   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1859   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1860   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1861   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1862 
1863   if (b->inode.size) {
1864     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1865   } else {
1866     C->ops->solve           = MatSolve_SeqAIJ;
1867   }
1868   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1869   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1870   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1871   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1872   C->assembled              = PETSC_TRUE;
1873   C->preallocated           = PETSC_TRUE;
1874 
1875   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1876 
1877   /* MatShiftView(A,info,&sctx) */
1878   if (sctx.nshift) {
1879     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1880       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1881     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1882       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1883     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1884       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1885     }
1886   }
1887   PetscFunctionReturn(0);
1888 }
1889 
1890 #undef __FUNCT__
1891 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1892 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1893 {
1894   Mat             C     = B;
1895   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1896   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1897   PetscErrorCode  ierr;
1898   const PetscInt  *r,*ic,*c,*ics;
1899   PetscInt        n   = A->rmap->n,*bi = b->i;
1900   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1901   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1902   PetscInt        *ai = a->i,*aj = a->j;
1903   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1904   PetscScalar     mul1,mul2,mul3,tmp;
1905   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1906   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1907   PetscReal       rs=0.0;
1908   FactorShiftCtx  sctx;
1909 
1910   PetscFunctionBegin;
1911   sctx.shift_top      = 0;
1912   sctx.nshift_max     = 0;
1913   sctx.shift_lo       = 0;
1914   sctx.shift_hi       = 0;
1915   sctx.shift_fraction = 0;
1916 
1917   /* if both shift schemes are chosen by user, only use info->shiftpd */
1918   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1919     sctx.shift_top = 0;
1920     for (i=0; i<n; i++) {
1921       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1922       rs    = 0.0;
1923       ajtmp = aj + ai[i];
1924       rtmp1 = aa + ai[i];
1925       nz    = ai[i+1] - ai[i];
1926       for (j=0; j<nz; j++) {
1927         if (*ajtmp != i) {
1928           rs += PetscAbsScalar(*rtmp1++);
1929         } else {
1930           rs -= PetscRealPart(*rtmp1++);
1931         }
1932         ajtmp++;
1933       }
1934       if (rs>sctx.shift_top) sctx.shift_top = rs;
1935     }
1936     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1937     sctx.shift_top *= 1.1;
1938     sctx.nshift_max = 5;
1939     sctx.shift_lo   = 0.;
1940     sctx.shift_hi   = 1.;
1941   }
1942   sctx.shift_amount = 0;
1943   sctx.nshift       = 0;
1944 
1945   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1946   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1947   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1948   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1949   ics    = ic;
1950 
1951   node_max = a->inode.node_count;
1952   ns       = a->inode.size;
1953   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1954 
1955   /* If max inode size > 3, split it into two inodes.*/
1956   /* also map the inode sizes according to the ordering */
1957   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1958   for (i=0,j=0; i<node_max; ++i,++j) {
1959     if (ns[i]>3) {
1960       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1961       ++j;
1962       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1963     } else {
1964       tmp_vec1[j] = ns[i];
1965     }
1966   }
1967   /* Use the correct node_max */
1968   node_max = j;
1969 
1970   /* Now reorder the inode info based on mat re-ordering info */
1971   /* First create a row -> inode_size_array_index map */
1972   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1973   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1974   for (i=0,row=0; i<node_max; i++) {
1975     nodesz = tmp_vec1[i];
1976     for (j=0; j<nodesz; j++,row++) {
1977       nsmap[row] = i;
1978     }
1979   }
1980   /* Using nsmap, create a reordered ns structure */
1981   for (i=0,j=0; i< node_max; i++) {
1982     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1983     tmp_vec2[i] = nodesz;
1984     j          += nodesz;
1985   }
1986   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1987   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1988   /* Now use the correct ns */
1989   ns = tmp_vec2;
1990 
1991   do {
1992     sctx.newshift = PETSC_FALSE;
1993     /* Now loop over each block-row, and do the factorization */
1994     for (i=0,row=0; i<node_max; i++) {
1995       nodesz = ns[i];
1996       nz     = bi[row+1] - bi[row];
1997       bjtmp  = bj + bi[row];
1998 
1999       switch (nodesz) {
2000       case 1:
2001         for  (j=0; j<nz; j++) {
2002           idx         = bjtmp[j];
2003           rtmp11[idx] = 0.0;
2004         }
2005 
2006         /* load in initial (unfactored row) */
2007         idx    = r[row];
2008         nz_tmp = ai[idx+1] - ai[idx];
2009         ajtmp  = aj + ai[idx];
2010         v1     = aa + ai[idx];
2011 
2012         for (j=0; j<nz_tmp; j++) {
2013           idx         = ics[ajtmp[j]];
2014           rtmp11[idx] = v1[j];
2015         }
2016         rtmp11[ics[r[row]]] += sctx.shift_amount;
2017 
2018         prow = *bjtmp++;
2019         while (prow < row) {
2020           pc1 = rtmp11 + prow;
2021           if (*pc1 != 0.0) {
2022             pv     = ba + bd[prow];
2023             pj     = nbj + bd[prow];
2024             mul1   = *pc1 * *pv++;
2025             *pc1   = mul1;
2026             nz_tmp = bi[prow+1] - bd[prow] - 1;
2027             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2028             for (j=0; j<nz_tmp; j++) {
2029               tmp          = pv[j];
2030               idx          = pj[j];
2031               rtmp11[idx] -= mul1 * tmp;
2032             }
2033           }
2034           prow = *bjtmp++;
2035         }
2036         pj  = bj + bi[row];
2037         pc1 = ba + bi[row];
2038 
2039         sctx.pv     = rtmp11[row];
2040         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2041         rs          = 0.0;
2042         for (j=0; j<nz; j++) {
2043           idx    = pj[j];
2044           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2045           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2046         }
2047         sctx.rs = rs;
2048         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2049         if (sctx.newshift) goto endofwhile;
2050         break;
2051 
2052       case 2:
2053         for (j=0; j<nz; j++) {
2054           idx         = bjtmp[j];
2055           rtmp11[idx] = 0.0;
2056           rtmp22[idx] = 0.0;
2057         }
2058 
2059         /* load in initial (unfactored row) */
2060         idx    = r[row];
2061         nz_tmp = ai[idx+1] - ai[idx];
2062         ajtmp  = aj + ai[idx];
2063         v1     = aa + ai[idx];
2064         v2     = aa + ai[idx+1];
2065         for (j=0; j<nz_tmp; j++) {
2066           idx         = ics[ajtmp[j]];
2067           rtmp11[idx] = v1[j];
2068           rtmp22[idx] = v2[j];
2069         }
2070         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2071         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2072 
2073         prow = *bjtmp++;
2074         while (prow < row) {
2075           pc1 = rtmp11 + prow;
2076           pc2 = rtmp22 + prow;
2077           if (*pc1 != 0.0 || *pc2 != 0.0) {
2078             pv   = ba + bd[prow];
2079             pj   = nbj + bd[prow];
2080             mul1 = *pc1 * *pv;
2081             mul2 = *pc2 * *pv;
2082             ++pv;
2083             *pc1 = mul1;
2084             *pc2 = mul2;
2085 
2086             nz_tmp = bi[prow+1] - bd[prow] - 1;
2087             for (j=0; j<nz_tmp; j++) {
2088               tmp          = pv[j];
2089               idx          = pj[j];
2090               rtmp11[idx] -= mul1 * tmp;
2091               rtmp22[idx] -= mul2 * tmp;
2092             }
2093             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2094           }
2095           prow = *bjtmp++;
2096         }
2097 
2098         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2099         pc1 = rtmp11 + prow;
2100         pc2 = rtmp22 + prow;
2101 
2102         sctx.pv = *pc1;
2103         pj      = bj + bi[prow];
2104         rs      = 0.0;
2105         for (j=0; j<nz; j++) {
2106           idx = pj[j];
2107           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2108         }
2109         sctx.rs = rs;
2110         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2111         if (sctx.newshift) goto endofwhile;
2112 
2113         if (*pc2 != 0.0) {
2114           pj     = nbj + bd[prow];
2115           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2116           *pc2   = mul2;
2117           nz_tmp = bi[prow+1] - bd[prow] - 1;
2118           for (j=0; j<nz_tmp; j++) {
2119             idx          = pj[j];
2120             tmp          = rtmp11[idx];
2121             rtmp22[idx] -= mul2 * tmp;
2122           }
2123           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2124         }
2125 
2126         pj  = bj + bi[row];
2127         pc1 = ba + bi[row];
2128         pc2 = ba + bi[row+1];
2129 
2130         sctx.pv       = rtmp22[row+1];
2131         rs            = 0.0;
2132         rtmp11[row]   = 1.0/rtmp11[row];
2133         rtmp22[row+1] = 1.0/rtmp22[row+1];
2134         /* copy row entries from dense representation to sparse */
2135         for (j=0; j<nz; j++) {
2136           idx    = pj[j];
2137           pc1[j] = rtmp11[idx];
2138           pc2[j] = rtmp22[idx];
2139           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2140         }
2141         sctx.rs = rs;
2142         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2143         if (sctx.newshift) goto endofwhile;
2144         break;
2145 
2146       case 3:
2147         for  (j=0; j<nz; j++) {
2148           idx         = bjtmp[j];
2149           rtmp11[idx] = 0.0;
2150           rtmp22[idx] = 0.0;
2151           rtmp33[idx] = 0.0;
2152         }
2153         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2154         idx    = r[row];
2155         nz_tmp = ai[idx+1] - ai[idx];
2156         ajtmp  = aj + ai[idx];
2157         v1     = aa + ai[idx];
2158         v2     = aa + ai[idx+1];
2159         v3     = aa + ai[idx+2];
2160         for (j=0; j<nz_tmp; j++) {
2161           idx         = ics[ajtmp[j]];
2162           rtmp11[idx] = v1[j];
2163           rtmp22[idx] = v2[j];
2164           rtmp33[idx] = v3[j];
2165         }
2166         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2167         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2168         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2169 
2170         /* loop over all pivot row blocks above this row block */
2171         prow = *bjtmp++;
2172         while (prow < row) {
2173           pc1 = rtmp11 + prow;
2174           pc2 = rtmp22 + prow;
2175           pc3 = rtmp33 + prow;
2176           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2177             pv   = ba  + bd[prow];
2178             pj   = nbj + bd[prow];
2179             mul1 = *pc1 * *pv;
2180             mul2 = *pc2 * *pv;
2181             mul3 = *pc3 * *pv;
2182             ++pv;
2183             *pc1 = mul1;
2184             *pc2 = mul2;
2185             *pc3 = mul3;
2186 
2187             nz_tmp = bi[prow+1] - bd[prow] - 1;
2188             /* update this row based on pivot row */
2189             for (j=0; j<nz_tmp; j++) {
2190               tmp          = pv[j];
2191               idx          = pj[j];
2192               rtmp11[idx] -= mul1 * tmp;
2193               rtmp22[idx] -= mul2 * tmp;
2194               rtmp33[idx] -= mul3 * tmp;
2195             }
2196             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2197           }
2198           prow = *bjtmp++;
2199         }
2200 
2201         /* Now take care of diagonal 3x3 block in this set of rows */
2202         /* note: prow = row here */
2203         pc1 = rtmp11 + prow;
2204         pc2 = rtmp22 + prow;
2205         pc3 = rtmp33 + prow;
2206 
2207         sctx.pv = *pc1;
2208         pj      = bj + bi[prow];
2209         rs      = 0.0;
2210         for (j=0; j<nz; j++) {
2211           idx = pj[j];
2212           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2213         }
2214         sctx.rs = rs;
2215         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2216         if (sctx.newshift) goto endofwhile;
2217 
2218         if (*pc2 != 0.0 || *pc3 != 0.0) {
2219           mul2   = (*pc2)/(*pc1);
2220           mul3   = (*pc3)/(*pc1);
2221           *pc2   = mul2;
2222           *pc3   = mul3;
2223           nz_tmp = bi[prow+1] - bd[prow] - 1;
2224           pj     = nbj + bd[prow];
2225           for (j=0; j<nz_tmp; j++) {
2226             idx          = pj[j];
2227             tmp          = rtmp11[idx];
2228             rtmp22[idx] -= mul2 * tmp;
2229             rtmp33[idx] -= mul3 * tmp;
2230           }
2231           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2232         }
2233         ++prow;
2234 
2235         pc2     = rtmp22 + prow;
2236         pc3     = rtmp33 + prow;
2237         sctx.pv = *pc2;
2238         pj      = bj + bi[prow];
2239         rs      = 0.0;
2240         for (j=0; j<nz; j++) {
2241           idx = pj[j];
2242           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2243         }
2244         sctx.rs = rs;
2245         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2246         if (sctx.newshift) goto endofwhile;
2247 
2248         if (*pc3 != 0.0) {
2249           mul3   = (*pc3)/(*pc2);
2250           *pc3   = mul3;
2251           pj     = nbj + bd[prow];
2252           nz_tmp = bi[prow+1] - bd[prow] - 1;
2253           for (j=0; j<nz_tmp; j++) {
2254             idx          = pj[j];
2255             tmp          = rtmp22[idx];
2256             rtmp33[idx] -= mul3 * tmp;
2257           }
2258           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2259         }
2260 
2261         pj  = bj + bi[row];
2262         pc1 = ba + bi[row];
2263         pc2 = ba + bi[row+1];
2264         pc3 = ba + bi[row+2];
2265 
2266         sctx.pv       = rtmp33[row+2];
2267         rs            = 0.0;
2268         rtmp11[row]   = 1.0/rtmp11[row];
2269         rtmp22[row+1] = 1.0/rtmp22[row+1];
2270         rtmp33[row+2] = 1.0/rtmp33[row+2];
2271         /* copy row entries from dense representation to sparse */
2272         for (j=0; j<nz; j++) {
2273           idx    = pj[j];
2274           pc1[j] = rtmp11[idx];
2275           pc2[j] = rtmp22[idx];
2276           pc3[j] = rtmp33[idx];
2277           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2278         }
2279 
2280         sctx.rs = rs;
2281         ierr    = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr);
2282         if (sctx.newshift) goto endofwhile;
2283         break;
2284 
2285       default:
2286         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2287       }
2288       row += nodesz;                 /* Update the row */
2289     }
2290 endofwhile:;
2291   } while (sctx.newshift);
2292   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2293   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2294   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2295   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2296   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2297 
2298   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2299   /* do not set solve add, since MatSolve_Inode + Add is faster */
2300   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2301   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2302   C->assembled              = PETSC_TRUE;
2303   C->preallocated           = PETSC_TRUE;
2304   if (sctx.nshift) {
2305     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2306       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2307     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2308       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2309     }
2310   }
2311   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2312   ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2313   PetscFunctionReturn(0);
2314 }
2315 
2316 
2317 /* ----------------------------------------------------------- */
2318 #undef __FUNCT__
2319 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2320 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2321 {
2322   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2323   IS                iscol = a->col,isrow = a->row;
2324   PetscErrorCode    ierr;
2325   const PetscInt    *r,*c,*rout,*cout;
2326   PetscInt          i,j,n = A->rmap->n;
2327   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2328   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2329   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2330   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2331   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2332   const PetscScalar *b;
2333 
2334   PetscFunctionBegin;
2335   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2336   node_max = a->inode.node_count;
2337   ns       = a->inode.size;     /* Node Size array */
2338 
2339   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2340   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2341   tmp  = a->solve_work;
2342 
2343   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2344   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2345 
2346   /* forward solve the lower triangular */
2347   tmps = tmp;
2348   aa   = a_a;
2349   aj   = a_j;
2350   ad   = a->diag;
2351 
2352   for (i = 0,row = 0; i< node_max; ++i) {
2353     nsz = ns[i];
2354     aii = ai[row];
2355     v1  = aa + aii;
2356     vi  = aj + aii;
2357     nz  = ai[row+1]- ai[row];
2358 
2359     if (i < node_max-1) {
2360       /* Prefetch the indices for the next block */
2361       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2362       /* Prefetch the data for the next block */
2363       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2364     }
2365 
2366     switch (nsz) {               /* Each loop in 'case' is unrolled */
2367     case 1:
2368       sum1 = b[r[row]];
2369       for (j=0; j<nz-1; j+=2) {
2370         i0    = vi[j];
2371         i1    = vi[j+1];
2372         tmp0  = tmps[i0];
2373         tmp1  = tmps[i1];
2374         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2375       }
2376       if (j == nz-1) {
2377         tmp0  = tmps[vi[j]];
2378         sum1 -= v1[j]*tmp0;
2379       }
2380       tmp[row++]=sum1;
2381       break;
2382     case 2:
2383       sum1 = b[r[row]];
2384       sum2 = b[r[row+1]];
2385       v2   = aa + ai[row+1];
2386 
2387       for (j=0; j<nz-1; j+=2) {
2388         i0    = vi[j];
2389         i1    = vi[j+1];
2390         tmp0  = tmps[i0];
2391         tmp1  = tmps[i1];
2392         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2393         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2394       }
2395       if (j == nz-1) {
2396         tmp0  = tmps[vi[j]];
2397         sum1 -= v1[j] *tmp0;
2398         sum2 -= v2[j] *tmp0;
2399       }
2400       sum2     -= v2[nz] * sum1;
2401       tmp[row++]=sum1;
2402       tmp[row++]=sum2;
2403       break;
2404     case 3:
2405       sum1 = b[r[row]];
2406       sum2 = b[r[row+1]];
2407       sum3 = b[r[row+2]];
2408       v2   = aa + ai[row+1];
2409       v3   = aa + ai[row+2];
2410 
2411       for (j=0; j<nz-1; j+=2) {
2412         i0    = vi[j];
2413         i1    = vi[j+1];
2414         tmp0  = tmps[i0];
2415         tmp1  = tmps[i1];
2416         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2417         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2418         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2419       }
2420       if (j == nz-1) {
2421         tmp0  = tmps[vi[j]];
2422         sum1 -= v1[j] *tmp0;
2423         sum2 -= v2[j] *tmp0;
2424         sum3 -= v3[j] *tmp0;
2425       }
2426       sum2     -= v2[nz] * sum1;
2427       sum3     -= v3[nz] * sum1;
2428       sum3     -= v3[nz+1] * sum2;
2429       tmp[row++]=sum1;
2430       tmp[row++]=sum2;
2431       tmp[row++]=sum3;
2432       break;
2433 
2434     case 4:
2435       sum1 = b[r[row]];
2436       sum2 = b[r[row+1]];
2437       sum3 = b[r[row+2]];
2438       sum4 = b[r[row+3]];
2439       v2   = aa + ai[row+1];
2440       v3   = aa + ai[row+2];
2441       v4   = aa + ai[row+3];
2442 
2443       for (j=0; j<nz-1; j+=2) {
2444         i0    = vi[j];
2445         i1    = vi[j+1];
2446         tmp0  = tmps[i0];
2447         tmp1  = tmps[i1];
2448         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2449         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2450         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2451         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2452       }
2453       if (j == nz-1) {
2454         tmp0  = tmps[vi[j]];
2455         sum1 -= v1[j] *tmp0;
2456         sum2 -= v2[j] *tmp0;
2457         sum3 -= v3[j] *tmp0;
2458         sum4 -= v4[j] *tmp0;
2459       }
2460       sum2 -= v2[nz] * sum1;
2461       sum3 -= v3[nz] * sum1;
2462       sum4 -= v4[nz] * sum1;
2463       sum3 -= v3[nz+1] * sum2;
2464       sum4 -= v4[nz+1] * sum2;
2465       sum4 -= v4[nz+2] * sum3;
2466 
2467       tmp[row++]=sum1;
2468       tmp[row++]=sum2;
2469       tmp[row++]=sum3;
2470       tmp[row++]=sum4;
2471       break;
2472     case 5:
2473       sum1 = b[r[row]];
2474       sum2 = b[r[row+1]];
2475       sum3 = b[r[row+2]];
2476       sum4 = b[r[row+3]];
2477       sum5 = b[r[row+4]];
2478       v2   = aa + ai[row+1];
2479       v3   = aa + ai[row+2];
2480       v4   = aa + ai[row+3];
2481       v5   = aa + ai[row+4];
2482 
2483       for (j=0; j<nz-1; j+=2) {
2484         i0    = vi[j];
2485         i1    = vi[j+1];
2486         tmp0  = tmps[i0];
2487         tmp1  = tmps[i1];
2488         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2489         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2490         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2491         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2492         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2493       }
2494       if (j == nz-1) {
2495         tmp0  = tmps[vi[j]];
2496         sum1 -= v1[j] *tmp0;
2497         sum2 -= v2[j] *tmp0;
2498         sum3 -= v3[j] *tmp0;
2499         sum4 -= v4[j] *tmp0;
2500         sum5 -= v5[j] *tmp0;
2501       }
2502 
2503       sum2 -= v2[nz] * sum1;
2504       sum3 -= v3[nz] * sum1;
2505       sum4 -= v4[nz] * sum1;
2506       sum5 -= v5[nz] * sum1;
2507       sum3 -= v3[nz+1] * sum2;
2508       sum4 -= v4[nz+1] * sum2;
2509       sum5 -= v5[nz+1] * sum2;
2510       sum4 -= v4[nz+2] * sum3;
2511       sum5 -= v5[nz+2] * sum3;
2512       sum5 -= v5[nz+3] * sum4;
2513 
2514       tmp[row++]=sum1;
2515       tmp[row++]=sum2;
2516       tmp[row++]=sum3;
2517       tmp[row++]=sum4;
2518       tmp[row++]=sum5;
2519       break;
2520     default:
2521       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2522     }
2523   }
2524   /* backward solve the upper triangular */
2525   for (i=node_max -1,row = n-1; i>=0; i--) {
2526     nsz = ns[i];
2527     aii = ad[row+1] + 1;
2528     v1  = aa + aii;
2529     vi  = aj + aii;
2530     nz  = ad[row]- ad[row+1] - 1;
2531 
2532     if (i > 0) {
2533       /* Prefetch the indices for the next block */
2534       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2535       /* Prefetch the data for the next block */
2536       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2537     }
2538 
2539     switch (nsz) {               /* Each loop in 'case' is unrolled */
2540     case 1:
2541       sum1 = tmp[row];
2542 
2543       for (j=0; j<nz-1; j+=2) {
2544         i0    = vi[j];
2545         i1    = vi[j+1];
2546         tmp0  = tmps[i0];
2547         tmp1  = tmps[i1];
2548         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2549       }
2550       if (j == nz-1) {
2551         tmp0  = tmps[vi[j]];
2552         sum1 -= v1[j]*tmp0;
2553       }
2554       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2555       break;
2556     case 2:
2557       sum1 = tmp[row];
2558       sum2 = tmp[row-1];
2559       v2   = aa + ad[row] + 1;
2560       for (j=0; j<nz-1; j+=2) {
2561         i0    = vi[j];
2562         i1    = vi[j+1];
2563         tmp0  = tmps[i0];
2564         tmp1  = tmps[i1];
2565         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2566         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2567       }
2568       if (j == nz-1) {
2569         tmp0  = tmps[vi[j]];
2570         sum1 -= v1[j]* tmp0;
2571         sum2 -= v2[j+1]* tmp0;
2572       }
2573 
2574       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2575       sum2     -= v2[0] * tmp0;
2576       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2577       break;
2578     case 3:
2579       sum1 = tmp[row];
2580       sum2 = tmp[row -1];
2581       sum3 = tmp[row -2];
2582       v2   = aa + ad[row] + 1;
2583       v3   = aa + ad[row -1] + 1;
2584       for (j=0; j<nz-1; j+=2) {
2585         i0    = vi[j];
2586         i1    = vi[j+1];
2587         tmp0  = tmps[i0];
2588         tmp1  = tmps[i1];
2589         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2590         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2591         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2592       }
2593       if (j== nz-1) {
2594         tmp0  = tmps[vi[j]];
2595         sum1 -= v1[j] * tmp0;
2596         sum2 -= v2[j+1] * tmp0;
2597         sum3 -= v3[j+2] * tmp0;
2598       }
2599       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2600       sum2     -= v2[0]* tmp0;
2601       sum3     -= v3[1] * tmp0;
2602       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2603       sum3     -= v3[0]* tmp0;
2604       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2605 
2606       break;
2607     case 4:
2608       sum1 = tmp[row];
2609       sum2 = tmp[row -1];
2610       sum3 = tmp[row -2];
2611       sum4 = tmp[row -3];
2612       v2   = aa + ad[row]+1;
2613       v3   = aa + ad[row -1]+1;
2614       v4   = aa + ad[row -2]+1;
2615 
2616       for (j=0; j<nz-1; j+=2) {
2617         i0    = vi[j];
2618         i1    = vi[j+1];
2619         tmp0  = tmps[i0];
2620         tmp1  = tmps[i1];
2621         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2622         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2623         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2624         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2625       }
2626       if (j== nz-1) {
2627         tmp0  = tmps[vi[j]];
2628         sum1 -= v1[j] * tmp0;
2629         sum2 -= v2[j+1] * tmp0;
2630         sum3 -= v3[j+2] * tmp0;
2631         sum4 -= v4[j+3] * tmp0;
2632       }
2633 
2634       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2635       sum2     -= v2[0] * tmp0;
2636       sum3     -= v3[1] * tmp0;
2637       sum4     -= v4[2] * tmp0;
2638       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2639       sum3     -= v3[0] * tmp0;
2640       sum4     -= v4[1] * tmp0;
2641       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2642       sum4     -= v4[0] * tmp0;
2643       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2644       break;
2645     case 5:
2646       sum1 = tmp[row];
2647       sum2 = tmp[row -1];
2648       sum3 = tmp[row -2];
2649       sum4 = tmp[row -3];
2650       sum5 = tmp[row -4];
2651       v2   = aa + ad[row]+1;
2652       v3   = aa + ad[row -1]+1;
2653       v4   = aa + ad[row -2]+1;
2654       v5   = aa + ad[row -3]+1;
2655       for (j=0; j<nz-1; j+=2) {
2656         i0    = vi[j];
2657         i1    = vi[j+1];
2658         tmp0  = tmps[i0];
2659         tmp1  = tmps[i1];
2660         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2661         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2662         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2663         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2664         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2665       }
2666       if (j==nz-1) {
2667         tmp0  = tmps[vi[j]];
2668         sum1 -= v1[j] * tmp0;
2669         sum2 -= v2[j+1] * tmp0;
2670         sum3 -= v3[j+2] * tmp0;
2671         sum4 -= v4[j+3] * tmp0;
2672         sum5 -= v5[j+4] * tmp0;
2673       }
2674 
2675       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2676       sum2     -= v2[0] * tmp0;
2677       sum3     -= v3[1] * tmp0;
2678       sum4     -= v4[2] * tmp0;
2679       sum5     -= v5[3] * tmp0;
2680       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2681       sum3     -= v3[0] * tmp0;
2682       sum4     -= v4[1] * tmp0;
2683       sum5     -= v5[2] * tmp0;
2684       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2685       sum4     -= v4[0] * tmp0;
2686       sum5     -= v5[1] * tmp0;
2687       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2688       sum5     -= v5[0] * tmp0;
2689       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2690       break;
2691     default:
2692       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2693     }
2694   }
2695   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2696   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2697   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2698   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2699   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2700   PetscFunctionReturn(0);
2701 }
2702 
2703 
2704 /*
2705      Makes a longer coloring[] array and calls the usual code with that
2706 */
2707 #undef __FUNCT__
2708 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2709 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2710 {
2711   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2712   PetscErrorCode  ierr;
2713   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2714   PetscInt        *colorused,i;
2715   ISColoringValue *newcolor;
2716 
2717   PetscFunctionBegin;
2718   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2719   /* loop over inodes, marking a color for each column*/
2720   row = 0;
2721   for (i=0; i<m; i++) {
2722     for (j=0; j<ns[i]; j++) {
2723       newcolor[row++] = coloring[i] + j*ncolors;
2724     }
2725   }
2726 
2727   /* eliminate unneeded colors */
2728   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2729   for (i=0; i<n; i++) {
2730     colorused[newcolor[i]] = 1;
2731   }
2732 
2733   for (i=1; i<5*ncolors; i++) {
2734     colorused[i] += colorused[i-1];
2735   }
2736   ncolors = colorused[5*ncolors-1];
2737   for (i=0; i<n; i++) {
2738     newcolor[i] = colorused[newcolor[i]]-1;
2739   }
2740   ierr = PetscFree(colorused);CHKERRQ(ierr);
2741   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2742   ierr = PetscFree(coloring);CHKERRQ(ierr);
2743   PetscFunctionReturn(0);
2744 }
2745 
2746 #include <petsc/private/kernels/blockinvert.h>
2747 
2748 #undef __FUNCT__
2749 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2750 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2751 {
2752   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2753   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2754   MatScalar         *ibdiag,*bdiag,work[25],*t;
2755   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2756   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2757   const PetscScalar *xb, *b;
2758   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2759   PetscErrorCode    ierr;
2760   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2761   PetscInt          sz,k,ipvt[5];
2762   PetscBool         allowzeropivot,zeropivotdetected=PETSC_FALSE;
2763   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2764 
2765   PetscFunctionBegin;
2766   allowzeropivot = PetscNot(A->erroriffailure);
2767   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2768   if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2769   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2770 
2771   if (!a->inode.ibdiagvalid) {
2772     if (!a->inode.ibdiag) {
2773       /* calculate space needed for diagonal blocks */
2774       for (i=0; i<m; i++) {
2775         cnt += sizes[i]*sizes[i];
2776       }
2777       a->inode.bdiagsize = cnt;
2778 
2779       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2780     }
2781 
2782     /* copy over the diagonal blocks and invert them */
2783     ibdiag = a->inode.ibdiag;
2784     bdiag  = a->inode.bdiag;
2785     cnt    = 0;
2786     for (i=0, row = 0; i<m; i++) {
2787       for (j=0; j<sizes[i]; j++) {
2788         for (k=0; k<sizes[i]; k++) {
2789           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2790         }
2791       }
2792       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2793 
2794       switch (sizes[i]) {
2795       case 1:
2796         /* Create matrix data structure */
2797         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2798         ibdiag[cnt] = 1.0/ibdiag[cnt];
2799         break;
2800       case 2:
2801         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2802         break;
2803       case 3:
2804         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2805         break;
2806       case 4:
2807         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2808         break;
2809       case 5:
2810         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2811         break;
2812       default:
2813         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2814       }
2815       if (zeropivotdetected) {
2816         A->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2817       }
2818 
2819       cnt += sizes[i]*sizes[i];
2820       row += sizes[i];
2821     }
2822     a->inode.ibdiagvalid = PETSC_TRUE;
2823   }
2824   ibdiag = a->inode.ibdiag;
2825   bdiag  = a->inode.bdiag;
2826   t      = a->inode.ssor_work;
2827 
2828   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2829   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2830   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2831   if (flag & SOR_ZERO_INITIAL_GUESS) {
2832     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2833 
2834       for (i=0, row=0; i<m; i++) {
2835         sz  = diag[row] - ii[row];
2836         v1  = a->a + ii[row];
2837         idx = a->j + ii[row];
2838 
2839         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2840         switch (sizes[i]) {
2841         case 1:
2842 
2843           sum1 = b[row];
2844           for (n = 0; n<sz-1; n+=2) {
2845             i1    = idx[0];
2846             i2    = idx[1];
2847             idx  += 2;
2848             tmp0  = x[i1];
2849             tmp1  = x[i2];
2850             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2851           }
2852 
2853           if (n == sz-1) {
2854             tmp0  = x[*idx];
2855             sum1 -= *v1 * tmp0;
2856           }
2857           t[row]   = sum1;
2858           x[row++] = sum1*(*ibdiag++);
2859           break;
2860         case 2:
2861           v2   = a->a + ii[row+1];
2862           sum1 = b[row];
2863           sum2 = b[row+1];
2864           for (n = 0; n<sz-1; n+=2) {
2865             i1    = idx[0];
2866             i2    = idx[1];
2867             idx  += 2;
2868             tmp0  = x[i1];
2869             tmp1  = x[i2];
2870             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2871             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2872           }
2873 
2874           if (n == sz-1) {
2875             tmp0  = x[*idx];
2876             sum1 -= v1[0] * tmp0;
2877             sum2 -= v2[0] * tmp0;
2878           }
2879           t[row]   = sum1;
2880           t[row+1] = sum2;
2881           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2882           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2883           ibdiag  += 4;
2884           break;
2885         case 3:
2886           v2   = a->a + ii[row+1];
2887           v3   = a->a + ii[row+2];
2888           sum1 = b[row];
2889           sum2 = b[row+1];
2890           sum3 = b[row+2];
2891           for (n = 0; n<sz-1; n+=2) {
2892             i1    = idx[0];
2893             i2    = idx[1];
2894             idx  += 2;
2895             tmp0  = x[i1];
2896             tmp1  = x[i2];
2897             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2898             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2899             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2900           }
2901 
2902           if (n == sz-1) {
2903             tmp0  = x[*idx];
2904             sum1 -= v1[0] * tmp0;
2905             sum2 -= v2[0] * tmp0;
2906             sum3 -= v3[0] * tmp0;
2907           }
2908           t[row]   = sum1;
2909           t[row+1] = sum2;
2910           t[row+2] = sum3;
2911           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2912           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2913           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2914           ibdiag  += 9;
2915           break;
2916         case 4:
2917           v2   = a->a + ii[row+1];
2918           v3   = a->a + ii[row+2];
2919           v4   = a->a + ii[row+3];
2920           sum1 = b[row];
2921           sum2 = b[row+1];
2922           sum3 = b[row+2];
2923           sum4 = b[row+3];
2924           for (n = 0; n<sz-1; n+=2) {
2925             i1    = idx[0];
2926             i2    = idx[1];
2927             idx  += 2;
2928             tmp0  = x[i1];
2929             tmp1  = x[i2];
2930             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2931             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2932             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2933             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2934           }
2935 
2936           if (n == sz-1) {
2937             tmp0  = x[*idx];
2938             sum1 -= v1[0] * tmp0;
2939             sum2 -= v2[0] * tmp0;
2940             sum3 -= v3[0] * tmp0;
2941             sum4 -= v4[0] * tmp0;
2942           }
2943           t[row]   = sum1;
2944           t[row+1] = sum2;
2945           t[row+2] = sum3;
2946           t[row+3] = sum4;
2947           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2948           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2949           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2950           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2951           ibdiag  += 16;
2952           break;
2953         case 5:
2954           v2   = a->a + ii[row+1];
2955           v3   = a->a + ii[row+2];
2956           v4   = a->a + ii[row+3];
2957           v5   = a->a + ii[row+4];
2958           sum1 = b[row];
2959           sum2 = b[row+1];
2960           sum3 = b[row+2];
2961           sum4 = b[row+3];
2962           sum5 = b[row+4];
2963           for (n = 0; n<sz-1; n+=2) {
2964             i1    = idx[0];
2965             i2    = idx[1];
2966             idx  += 2;
2967             tmp0  = x[i1];
2968             tmp1  = x[i2];
2969             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2970             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2971             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2972             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2973             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2974           }
2975 
2976           if (n == sz-1) {
2977             tmp0  = x[*idx];
2978             sum1 -= v1[0] * tmp0;
2979             sum2 -= v2[0] * tmp0;
2980             sum3 -= v3[0] * tmp0;
2981             sum4 -= v4[0] * tmp0;
2982             sum5 -= v5[0] * tmp0;
2983           }
2984           t[row]   = sum1;
2985           t[row+1] = sum2;
2986           t[row+2] = sum3;
2987           t[row+3] = sum4;
2988           t[row+4] = sum5;
2989           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2990           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2991           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2992           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2993           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2994           ibdiag  += 25;
2995           break;
2996         default:
2997           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2998         }
2999       }
3000 
3001       xb   = t;
3002       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3003     } else xb = b;
3004     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3005 
3006       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3007       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3008         ibdiag -= sizes[i]*sizes[i];
3009         sz      = ii[row+1] - diag[row] - 1;
3010         v1      = a->a + diag[row] + 1;
3011         idx     = a->j + diag[row] + 1;
3012 
3013         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3014         switch (sizes[i]) {
3015         case 1:
3016 
3017           sum1 = xb[row];
3018           for (n = 0; n<sz-1; n+=2) {
3019             i1    = idx[0];
3020             i2    = idx[1];
3021             idx  += 2;
3022             tmp0  = x[i1];
3023             tmp1  = x[i2];
3024             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3025           }
3026 
3027           if (n == sz-1) {
3028             tmp0  = x[*idx];
3029             sum1 -= *v1*tmp0;
3030           }
3031           x[row--] = sum1*(*ibdiag);
3032           break;
3033 
3034         case 2:
3035 
3036           sum1 = xb[row];
3037           sum2 = xb[row-1];
3038           /* note that sum1 is associated with the second of the two rows */
3039           v2 = a->a + diag[row-1] + 2;
3040           for (n = 0; n<sz-1; n+=2) {
3041             i1    = idx[0];
3042             i2    = idx[1];
3043             idx  += 2;
3044             tmp0  = x[i1];
3045             tmp1  = x[i2];
3046             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3047             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3048           }
3049 
3050           if (n == sz-1) {
3051             tmp0  = x[*idx];
3052             sum1 -= *v1*tmp0;
3053             sum2 -= *v2*tmp0;
3054           }
3055           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3056           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3057           break;
3058         case 3:
3059 
3060           sum1 = xb[row];
3061           sum2 = xb[row-1];
3062           sum3 = xb[row-2];
3063           v2   = a->a + diag[row-1] + 2;
3064           v3   = a->a + diag[row-2] + 3;
3065           for (n = 0; n<sz-1; n+=2) {
3066             i1    = idx[0];
3067             i2    = idx[1];
3068             idx  += 2;
3069             tmp0  = x[i1];
3070             tmp1  = x[i2];
3071             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3072             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3073             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3074           }
3075 
3076           if (n == sz-1) {
3077             tmp0  = x[*idx];
3078             sum1 -= *v1*tmp0;
3079             sum2 -= *v2*tmp0;
3080             sum3 -= *v3*tmp0;
3081           }
3082           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3083           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3084           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3085           break;
3086         case 4:
3087 
3088           sum1 = xb[row];
3089           sum2 = xb[row-1];
3090           sum3 = xb[row-2];
3091           sum4 = xb[row-3];
3092           v2   = a->a + diag[row-1] + 2;
3093           v3   = a->a + diag[row-2] + 3;
3094           v4   = a->a + diag[row-3] + 4;
3095           for (n = 0; n<sz-1; n+=2) {
3096             i1    = idx[0];
3097             i2    = idx[1];
3098             idx  += 2;
3099             tmp0  = x[i1];
3100             tmp1  = x[i2];
3101             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3102             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3103             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3104             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3105           }
3106 
3107           if (n == sz-1) {
3108             tmp0  = x[*idx];
3109             sum1 -= *v1*tmp0;
3110             sum2 -= *v2*tmp0;
3111             sum3 -= *v3*tmp0;
3112             sum4 -= *v4*tmp0;
3113           }
3114           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3115           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3116           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3117           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3118           break;
3119         case 5:
3120 
3121           sum1 = xb[row];
3122           sum2 = xb[row-1];
3123           sum3 = xb[row-2];
3124           sum4 = xb[row-3];
3125           sum5 = xb[row-4];
3126           v2   = a->a + diag[row-1] + 2;
3127           v3   = a->a + diag[row-2] + 3;
3128           v4   = a->a + diag[row-3] + 4;
3129           v5   = a->a + diag[row-4] + 5;
3130           for (n = 0; n<sz-1; n+=2) {
3131             i1    = idx[0];
3132             i2    = idx[1];
3133             idx  += 2;
3134             tmp0  = x[i1];
3135             tmp1  = x[i2];
3136             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3137             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3138             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3139             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3140             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3141           }
3142 
3143           if (n == sz-1) {
3144             tmp0  = x[*idx];
3145             sum1 -= *v1*tmp0;
3146             sum2 -= *v2*tmp0;
3147             sum3 -= *v3*tmp0;
3148             sum4 -= *v4*tmp0;
3149             sum5 -= *v5*tmp0;
3150           }
3151           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3152           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3153           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3154           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3155           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3156           break;
3157         default:
3158           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3159         }
3160       }
3161 
3162       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3163     }
3164     its--;
3165   }
3166   while (its--) {
3167 
3168     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3169       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3170            i<m;
3171            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3172 
3173         sz  = diag[row] - ii[row];
3174         v1  = a->a + ii[row];
3175         idx = a->j + ii[row];
3176         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3177         switch (sizes[i]) {
3178         case 1:
3179           sum1 = b[row];
3180           for (n = 0; n<sz-1; n+=2) {
3181             i1    = idx[0];
3182             i2    = idx[1];
3183             idx  += 2;
3184             tmp0  = x[i1];
3185             tmp1  = x[i2];
3186             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3187           }
3188           if (n == sz-1) {
3189             tmp0  = x[*idx++];
3190             sum1 -= *v1 * tmp0;
3191             v1++;
3192           }
3193           t[row]   = sum1;
3194           sz      = ii[row+1] - diag[row] - 1;
3195           idx     = a->j + diag[row] + 1;
3196           v1 += 1;
3197           for (n = 0; n<sz-1; n+=2) {
3198             i1    = idx[0];
3199             i2    = idx[1];
3200             idx  += 2;
3201             tmp0  = x[i1];
3202             tmp1  = x[i2];
3203             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3204           }
3205           if (n == sz-1) {
3206             tmp0  = x[*idx++];
3207             sum1 -= *v1 * tmp0;
3208           }
3209           /* in MatSOR_SeqAIJ this line would be
3210            *
3211            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3212            *
3213            * but omega == 1, so this becomes
3214            *
3215            * x[row] = sum1*(*ibdiag++);
3216            *
3217            */
3218           x[row] = sum1*(*ibdiag);
3219           break;
3220         case 2:
3221           v2   = a->a + ii[row+1];
3222           sum1 = b[row];
3223           sum2 = b[row+1];
3224           for (n = 0; n<sz-1; n+=2) {
3225             i1    = idx[0];
3226             i2    = idx[1];
3227             idx  += 2;
3228             tmp0  = x[i1];
3229             tmp1  = x[i2];
3230             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3231             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3232           }
3233           if (n == sz-1) {
3234             tmp0  = x[*idx++];
3235             sum1 -= v1[0] * tmp0;
3236             sum2 -= v2[0] * tmp0;
3237             v1++; v2++;
3238           }
3239           t[row]   = sum1;
3240           t[row+1] = sum2;
3241           sz      = ii[row+1] - diag[row] - 2;
3242           idx     = a->j + diag[row] + 2;
3243           v1 += 2;
3244           v2 += 2;
3245           for (n = 0; n<sz-1; n+=2) {
3246             i1    = idx[0];
3247             i2    = idx[1];
3248             idx  += 2;
3249             tmp0  = x[i1];
3250             tmp1  = x[i2];
3251             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3252             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3253           }
3254           if (n == sz-1) {
3255             tmp0  = x[*idx];
3256             sum1 -= v1[0] * tmp0;
3257             sum2 -= v2[0] * tmp0;
3258           }
3259           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3260           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3261           break;
3262         case 3:
3263           v2   = a->a + ii[row+1];
3264           v3   = a->a + ii[row+2];
3265           sum1 = b[row];
3266           sum2 = b[row+1];
3267           sum3 = b[row+2];
3268           for (n = 0; n<sz-1; n+=2) {
3269             i1    = idx[0];
3270             i2    = idx[1];
3271             idx  += 2;
3272             tmp0  = x[i1];
3273             tmp1  = x[i2];
3274             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3275             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3276             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3277           }
3278           if (n == sz-1) {
3279             tmp0  = x[*idx++];
3280             sum1 -= v1[0] * tmp0;
3281             sum2 -= v2[0] * tmp0;
3282             sum3 -= v3[0] * tmp0;
3283             v1++; v2++; v3++;
3284           }
3285           t[row]   = sum1;
3286           t[row+1] = sum2;
3287           t[row+2] = sum3;
3288           sz      = ii[row+1] - diag[row] - 3;
3289           idx     = a->j + diag[row] + 3;
3290           v1 += 3;
3291           v2 += 3;
3292           v3 += 3;
3293           for (n = 0; n<sz-1; n+=2) {
3294             i1    = idx[0];
3295             i2    = idx[1];
3296             idx  += 2;
3297             tmp0  = x[i1];
3298             tmp1  = x[i2];
3299             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3300             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3301             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3302           }
3303           if (n == sz-1) {
3304             tmp0  = x[*idx];
3305             sum1 -= v1[0] * tmp0;
3306             sum2 -= v2[0] * tmp0;
3307             sum3 -= v3[0] * tmp0;
3308           }
3309           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3310           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3311           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3312           break;
3313         case 4:
3314           v2   = a->a + ii[row+1];
3315           v3   = a->a + ii[row+2];
3316           v4   = a->a + ii[row+3];
3317           sum1 = b[row];
3318           sum2 = b[row+1];
3319           sum3 = b[row+2];
3320           sum4 = b[row+3];
3321           for (n = 0; n<sz-1; n+=2) {
3322             i1    = idx[0];
3323             i2    = idx[1];
3324             idx  += 2;
3325             tmp0  = x[i1];
3326             tmp1  = x[i2];
3327             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3328             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3329             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3330             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3331           }
3332           if (n == sz-1) {
3333             tmp0  = x[*idx++];
3334             sum1 -= v1[0] * tmp0;
3335             sum2 -= v2[0] * tmp0;
3336             sum3 -= v3[0] * tmp0;
3337             sum4 -= v4[0] * tmp0;
3338             v1++; v2++; v3++; v4++;
3339           }
3340           t[row]   = sum1;
3341           t[row+1] = sum2;
3342           t[row+2] = sum3;
3343           t[row+3] = sum4;
3344           sz      = ii[row+1] - diag[row] - 4;
3345           idx     = a->j + diag[row] + 4;
3346           v1 += 4;
3347           v2 += 4;
3348           v3 += 4;
3349           v4 += 4;
3350           for (n = 0; n<sz-1; n+=2) {
3351             i1    = idx[0];
3352             i2    = idx[1];
3353             idx  += 2;
3354             tmp0  = x[i1];
3355             tmp1  = x[i2];
3356             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3357             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3358             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3359             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3360           }
3361           if (n == sz-1) {
3362             tmp0  = x[*idx];
3363             sum1 -= v1[0] * tmp0;
3364             sum2 -= v2[0] * tmp0;
3365             sum3 -= v3[0] * tmp0;
3366             sum4 -= v4[0] * tmp0;
3367           }
3368           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3369           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3370           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3371           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3372           break;
3373         case 5:
3374           v2   = a->a + ii[row+1];
3375           v3   = a->a + ii[row+2];
3376           v4   = a->a + ii[row+3];
3377           v5   = a->a + ii[row+4];
3378           sum1 = b[row];
3379           sum2 = b[row+1];
3380           sum3 = b[row+2];
3381           sum4 = b[row+3];
3382           sum5 = b[row+4];
3383           for (n = 0; n<sz-1; n+=2) {
3384             i1    = idx[0];
3385             i2    = idx[1];
3386             idx  += 2;
3387             tmp0  = x[i1];
3388             tmp1  = x[i2];
3389             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3390             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3391             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3392             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3393             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3394           }
3395           if (n == sz-1) {
3396             tmp0  = x[*idx++];
3397             sum1 -= v1[0] * tmp0;
3398             sum2 -= v2[0] * tmp0;
3399             sum3 -= v3[0] * tmp0;
3400             sum4 -= v4[0] * tmp0;
3401             sum5 -= v5[0] * tmp0;
3402             v1++; v2++; v3++; v4++; v5++;
3403           }
3404           t[row]   = sum1;
3405           t[row+1] = sum2;
3406           t[row+2] = sum3;
3407           t[row+3] = sum4;
3408           t[row+4] = sum5;
3409           sz      = ii[row+1] - diag[row] - 5;
3410           idx     = a->j + diag[row] + 5;
3411           v1 += 5;
3412           v2 += 5;
3413           v3 += 5;
3414           v4 += 5;
3415           v5 += 5;
3416           for (n = 0; n<sz-1; n+=2) {
3417             i1    = idx[0];
3418             i2    = idx[1];
3419             idx  += 2;
3420             tmp0  = x[i1];
3421             tmp1  = x[i2];
3422             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3423             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3424             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3425             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3426             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3427           }
3428           if (n == sz-1) {
3429             tmp0  = x[*idx];
3430             sum1 -= v1[0] * tmp0;
3431             sum2 -= v2[0] * tmp0;
3432             sum3 -= v3[0] * tmp0;
3433             sum4 -= v4[0] * tmp0;
3434             sum5 -= v5[0] * tmp0;
3435           }
3436           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3437           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3438           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3439           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3440           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3441           break;
3442         default:
3443           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3444         }
3445       }
3446       xb   = t;
3447       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3448     } else xb = b;
3449 
3450     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3451 
3452       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3453       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3454         ibdiag -= sizes[i]*sizes[i];
3455 
3456         /* set RHS */
3457         if (xb == b) {
3458           /* whole (old way) */
3459           sz      = ii[row+1] - ii[row];
3460           idx     = a->j + ii[row];
3461           switch (sizes[i]) {
3462           case 5:
3463             v5      = a->a + ii[row-4];
3464           case 4: /* fall through */
3465             v4      = a->a + ii[row-3];
3466           case 3:
3467             v3      = a->a + ii[row-2];
3468           case 2:
3469             v2      = a->a + ii[row-1];
3470           case 1:
3471             v1      = a->a + ii[row];
3472             break;
3473           default:
3474             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3475           }
3476         } else {
3477           /* upper, no diag */
3478           sz      = ii[row+1] - diag[row] - 1;
3479           idx     = a->j + diag[row] + 1;
3480           switch (sizes[i]) {
3481           case 5:
3482             v5      = a->a + diag[row-4] + 5;
3483           case 4: /* fall through */
3484             v4      = a->a + diag[row-3] + 4;
3485           case 3:
3486             v3      = a->a + diag[row-2] + 3;
3487           case 2:
3488             v2      = a->a + diag[row-1] + 2;
3489           case 1:
3490             v1      = a->a + diag[row] + 1;
3491           }
3492         }
3493         /* set sum */
3494         switch (sizes[i]) {
3495         case 5:
3496           sum5 = xb[row-4];
3497         case 4: /* fall through */
3498           sum4 = xb[row-3];
3499         case 3:
3500           sum3 = xb[row-2];
3501         case 2:
3502           sum2 = xb[row-1];
3503         case 1:
3504           /* note that sum1 is associated with the last row */
3505           sum1 = xb[row];
3506         }
3507         /* do sums */
3508         for (n = 0; n<sz-1; n+=2) {
3509             i1    = idx[0];
3510             i2    = idx[1];
3511             idx  += 2;
3512             tmp0  = x[i1];
3513             tmp1  = x[i2];
3514             switch (sizes[i]) {
3515             case 5:
3516               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3517             case 4: /* fall through */
3518               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3519             case 3:
3520               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3521             case 2:
3522               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3523             case 1:
3524               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3525             }
3526         }
3527         /* ragged edge */
3528         if (n == sz-1) {
3529           tmp0  = x[*idx];
3530           switch (sizes[i]) {
3531           case 5:
3532             sum5 -= *v5*tmp0;
3533           case 4: /* fall through */
3534             sum4 -= *v4*tmp0;
3535           case 3:
3536             sum3 -= *v3*tmp0;
3537           case 2:
3538             sum2 -= *v2*tmp0;
3539           case 1:
3540             sum1 -= *v1*tmp0;
3541           }
3542         }
3543         /* update */
3544         if (xb == b) {
3545           /* whole (old way) w/ diag */
3546           switch (sizes[i]) {
3547           case 5:
3548             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3549             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3550             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3551             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3552             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3553             break;
3554           case 4:
3555             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3556             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3557             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3558             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3559             break;
3560           case 3:
3561             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3562             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3563             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3564             break;
3565           case 2:
3566             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3567             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3568             break;
3569           case 1:
3570             x[row--] += sum1*(*ibdiag);
3571             break;
3572           }
3573         } else {
3574           /* no diag so set =  */
3575           switch (sizes[i]) {
3576           case 5:
3577             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3578             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3579             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3580             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3581             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3582             break;
3583           case 4:
3584             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3585             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3586             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3587             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3588             break;
3589           case 3:
3590             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3591             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3592             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3593             break;
3594           case 2:
3595             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3596             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3597             break;
3598           case 1:
3599             x[row--] = sum1*(*ibdiag);
3600             break;
3601           }
3602         }
3603       }
3604       if (xb == b) {
3605         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3606       } else {
3607         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3608       }
3609     }
3610   }
3611   if (flag & SOR_EISENSTAT) {
3612     /*
3613           Apply  (U + D)^-1  where D is now the block diagonal
3614     */
3615     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3616     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3617       ibdiag -= sizes[i]*sizes[i];
3618       sz      = ii[row+1] - diag[row] - 1;
3619       v1      = a->a + diag[row] + 1;
3620       idx     = a->j + diag[row] + 1;
3621       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3622       switch (sizes[i]) {
3623       case 1:
3624 
3625         sum1 = b[row];
3626         for (n = 0; n<sz-1; n+=2) {
3627           i1    = idx[0];
3628           i2    = idx[1];
3629           idx  += 2;
3630           tmp0  = x[i1];
3631           tmp1  = x[i2];
3632           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3633         }
3634 
3635         if (n == sz-1) {
3636           tmp0  = x[*idx];
3637           sum1 -= *v1*tmp0;
3638         }
3639         x[row] = sum1*(*ibdiag);row--;
3640         break;
3641 
3642       case 2:
3643 
3644         sum1 = b[row];
3645         sum2 = b[row-1];
3646         /* note that sum1 is associated with the second of the two rows */
3647         v2 = a->a + diag[row-1] + 2;
3648         for (n = 0; n<sz-1; n+=2) {
3649           i1    = idx[0];
3650           i2    = idx[1];
3651           idx  += 2;
3652           tmp0  = x[i1];
3653           tmp1  = x[i2];
3654           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3655           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3656         }
3657 
3658         if (n == sz-1) {
3659           tmp0  = x[*idx];
3660           sum1 -= *v1*tmp0;
3661           sum2 -= *v2*tmp0;
3662         }
3663         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3664         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3665         row     -= 2;
3666         break;
3667       case 3:
3668 
3669         sum1 = b[row];
3670         sum2 = b[row-1];
3671         sum3 = b[row-2];
3672         v2   = a->a + diag[row-1] + 2;
3673         v3   = a->a + diag[row-2] + 3;
3674         for (n = 0; n<sz-1; n+=2) {
3675           i1    = idx[0];
3676           i2    = idx[1];
3677           idx  += 2;
3678           tmp0  = x[i1];
3679           tmp1  = x[i2];
3680           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3681           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3682           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3683         }
3684 
3685         if (n == sz-1) {
3686           tmp0  = x[*idx];
3687           sum1 -= *v1*tmp0;
3688           sum2 -= *v2*tmp0;
3689           sum3 -= *v3*tmp0;
3690         }
3691         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3692         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3693         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3694         row     -= 3;
3695         break;
3696       case 4:
3697 
3698         sum1 = b[row];
3699         sum2 = b[row-1];
3700         sum3 = b[row-2];
3701         sum4 = b[row-3];
3702         v2   = a->a + diag[row-1] + 2;
3703         v3   = a->a + diag[row-2] + 3;
3704         v4   = a->a + diag[row-3] + 4;
3705         for (n = 0; n<sz-1; n+=2) {
3706           i1    = idx[0];
3707           i2    = idx[1];
3708           idx  += 2;
3709           tmp0  = x[i1];
3710           tmp1  = x[i2];
3711           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3712           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3713           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3714           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3715         }
3716 
3717         if (n == sz-1) {
3718           tmp0  = x[*idx];
3719           sum1 -= *v1*tmp0;
3720           sum2 -= *v2*tmp0;
3721           sum3 -= *v3*tmp0;
3722           sum4 -= *v4*tmp0;
3723         }
3724         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3725         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3726         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3727         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3728         row     -= 4;
3729         break;
3730       case 5:
3731 
3732         sum1 = b[row];
3733         sum2 = b[row-1];
3734         sum3 = b[row-2];
3735         sum4 = b[row-3];
3736         sum5 = b[row-4];
3737         v2   = a->a + diag[row-1] + 2;
3738         v3   = a->a + diag[row-2] + 3;
3739         v4   = a->a + diag[row-3] + 4;
3740         v5   = a->a + diag[row-4] + 5;
3741         for (n = 0; n<sz-1; n+=2) {
3742           i1    = idx[0];
3743           i2    = idx[1];
3744           idx  += 2;
3745           tmp0  = x[i1];
3746           tmp1  = x[i2];
3747           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3748           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3749           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3750           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3751           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3752         }
3753 
3754         if (n == sz-1) {
3755           tmp0  = x[*idx];
3756           sum1 -= *v1*tmp0;
3757           sum2 -= *v2*tmp0;
3758           sum3 -= *v3*tmp0;
3759           sum4 -= *v4*tmp0;
3760           sum5 -= *v5*tmp0;
3761         }
3762         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3763         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3764         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3765         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3766         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3767         row     -= 5;
3768         break;
3769       default:
3770         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3771       }
3772     }
3773     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3774 
3775     /*
3776            t = b - D x    where D is the block diagonal
3777     */
3778     cnt = 0;
3779     for (i=0, row=0; i<m; i++) {
3780       switch (sizes[i]) {
3781       case 1:
3782         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3783         break;
3784       case 2:
3785         x1       = x[row]; x2 = x[row+1];
3786         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3787         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3788         t[row]   = b[row] - tmp1;
3789         t[row+1] = b[row+1] - tmp2; row += 2;
3790         cnt     += 4;
3791         break;
3792       case 3:
3793         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3794         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3795         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3796         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3797         t[row]   = b[row] - tmp1;
3798         t[row+1] = b[row+1] - tmp2;
3799         t[row+2] = b[row+2] - tmp3; row += 3;
3800         cnt     += 9;
3801         break;
3802       case 4:
3803         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3804         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3805         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3806         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3807         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3808         t[row]   = b[row] - tmp1;
3809         t[row+1] = b[row+1] - tmp2;
3810         t[row+2] = b[row+2] - tmp3;
3811         t[row+3] = b[row+3] - tmp4; row += 4;
3812         cnt     += 16;
3813         break;
3814       case 5:
3815         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3816         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3817         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3818         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3819         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3820         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3821         t[row]   = b[row] - tmp1;
3822         t[row+1] = b[row+1] - tmp2;
3823         t[row+2] = b[row+2] - tmp3;
3824         t[row+3] = b[row+3] - tmp4;
3825         t[row+4] = b[row+4] - tmp5;row += 5;
3826         cnt     += 25;
3827         break;
3828       default:
3829         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3830       }
3831     }
3832     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3833 
3834 
3835 
3836     /*
3837           Apply (L + D)^-1 where D is the block diagonal
3838     */
3839     for (i=0, row=0; i<m; i++) {
3840       sz  = diag[row] - ii[row];
3841       v1  = a->a + ii[row];
3842       idx = a->j + ii[row];
3843       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3844       switch (sizes[i]) {
3845       case 1:
3846 
3847         sum1 = t[row];
3848         for (n = 0; n<sz-1; n+=2) {
3849           i1    = idx[0];
3850           i2    = idx[1];
3851           idx  += 2;
3852           tmp0  = t[i1];
3853           tmp1  = t[i2];
3854           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3855         }
3856 
3857         if (n == sz-1) {
3858           tmp0  = t[*idx];
3859           sum1 -= *v1 * tmp0;
3860         }
3861         x[row] += t[row] = sum1*(*ibdiag++); row++;
3862         break;
3863       case 2:
3864         v2   = a->a + ii[row+1];
3865         sum1 = t[row];
3866         sum2 = t[row+1];
3867         for (n = 0; n<sz-1; n+=2) {
3868           i1    = idx[0];
3869           i2    = idx[1];
3870           idx  += 2;
3871           tmp0  = t[i1];
3872           tmp1  = t[i2];
3873           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3874           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3875         }
3876 
3877         if (n == sz-1) {
3878           tmp0  = t[*idx];
3879           sum1 -= v1[0] * tmp0;
3880           sum2 -= v2[0] * tmp0;
3881         }
3882         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3883         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3884         ibdiag   += 4; row += 2;
3885         break;
3886       case 3:
3887         v2   = a->a + ii[row+1];
3888         v3   = a->a + ii[row+2];
3889         sum1 = t[row];
3890         sum2 = t[row+1];
3891         sum3 = t[row+2];
3892         for (n = 0; n<sz-1; n+=2) {
3893           i1    = idx[0];
3894           i2    = idx[1];
3895           idx  += 2;
3896           tmp0  = t[i1];
3897           tmp1  = t[i2];
3898           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3899           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3900           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3901         }
3902 
3903         if (n == sz-1) {
3904           tmp0  = t[*idx];
3905           sum1 -= v1[0] * tmp0;
3906           sum2 -= v2[0] * tmp0;
3907           sum3 -= v3[0] * tmp0;
3908         }
3909         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3910         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3911         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3912         ibdiag   += 9; row += 3;
3913         break;
3914       case 4:
3915         v2   = a->a + ii[row+1];
3916         v3   = a->a + ii[row+2];
3917         v4   = a->a + ii[row+3];
3918         sum1 = t[row];
3919         sum2 = t[row+1];
3920         sum3 = t[row+2];
3921         sum4 = t[row+3];
3922         for (n = 0; n<sz-1; n+=2) {
3923           i1    = idx[0];
3924           i2    = idx[1];
3925           idx  += 2;
3926           tmp0  = t[i1];
3927           tmp1  = t[i2];
3928           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3929           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3930           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3931           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3932         }
3933 
3934         if (n == sz-1) {
3935           tmp0  = t[*idx];
3936           sum1 -= v1[0] * tmp0;
3937           sum2 -= v2[0] * tmp0;
3938           sum3 -= v3[0] * tmp0;
3939           sum4 -= v4[0] * tmp0;
3940         }
3941         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3942         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3943         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3944         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3945         ibdiag   += 16; row += 4;
3946         break;
3947       case 5:
3948         v2   = a->a + ii[row+1];
3949         v3   = a->a + ii[row+2];
3950         v4   = a->a + ii[row+3];
3951         v5   = a->a + ii[row+4];
3952         sum1 = t[row];
3953         sum2 = t[row+1];
3954         sum3 = t[row+2];
3955         sum4 = t[row+3];
3956         sum5 = t[row+4];
3957         for (n = 0; n<sz-1; n+=2) {
3958           i1    = idx[0];
3959           i2    = idx[1];
3960           idx  += 2;
3961           tmp0  = t[i1];
3962           tmp1  = t[i2];
3963           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3964           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3965           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3966           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3967           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3968         }
3969 
3970         if (n == sz-1) {
3971           tmp0  = t[*idx];
3972           sum1 -= v1[0] * tmp0;
3973           sum2 -= v2[0] * tmp0;
3974           sum3 -= v3[0] * tmp0;
3975           sum4 -= v4[0] * tmp0;
3976           sum5 -= v5[0] * tmp0;
3977         }
3978         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3979         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3980         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3981         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3982         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3983         ibdiag   += 25; row += 5;
3984         break;
3985       default:
3986         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3987       }
3988     }
3989     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3990   }
3991   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3992   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3993   PetscFunctionReturn(0);
3994 }
3995 
3996 #undef __FUNCT__
3997 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3998 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3999 {
4000   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
4001   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4002   const MatScalar   *bdiag = a->inode.bdiag;
4003   const PetscScalar *b;
4004   PetscErrorCode    ierr;
4005   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
4006   const PetscInt    *sizes = a->inode.size;
4007 
4008   PetscFunctionBegin;
4009   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4010   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4011   cnt  = 0;
4012   for (i=0, row=0; i<m; i++) {
4013     switch (sizes[i]) {
4014     case 1:
4015       x[row] = b[row]*bdiag[cnt++];row++;
4016       break;
4017     case 2:
4018       x1       = b[row]; x2 = b[row+1];
4019       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4020       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4021       x[row++] = tmp1;
4022       x[row++] = tmp2;
4023       cnt     += 4;
4024       break;
4025     case 3:
4026       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4027       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4028       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4029       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4030       x[row++] = tmp1;
4031       x[row++] = tmp2;
4032       x[row++] = tmp3;
4033       cnt     += 9;
4034       break;
4035     case 4:
4036       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4037       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4038       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4039       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4040       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4041       x[row++] = tmp1;
4042       x[row++] = tmp2;
4043       x[row++] = tmp3;
4044       x[row++] = tmp4;
4045       cnt     += 16;
4046       break;
4047     case 5:
4048       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4049       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4050       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4051       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4052       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4053       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4054       x[row++] = tmp1;
4055       x[row++] = tmp2;
4056       x[row++] = tmp3;
4057       x[row++] = tmp4;
4058       x[row++] = tmp5;
4059       cnt     += 25;
4060       break;
4061     default:
4062       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4063     }
4064   }
4065   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4066   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4067   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4068   PetscFunctionReturn(0);
4069 }
4070 
4071 /*
4072     samestructure indicates that the matrix has not changed its nonzero structure so we
4073     do not need to recompute the inodes
4074 */
4075 #undef __FUNCT__
4076 #define __FUNCT__ "MatSeqAIJCheckInode"
4077 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4078 {
4079   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4080   PetscErrorCode ierr;
4081   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4082   PetscBool      flag;
4083   const PetscInt *idx,*idy,*ii;
4084 
4085   PetscFunctionBegin;
4086   if (!a->inode.use) PetscFunctionReturn(0);
4087   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4088 
4089   m = A->rmap->n;
4090   if (a->inode.size) ns = a->inode.size;
4091   else {
4092     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4093   }
4094 
4095   i          = 0;
4096   node_count = 0;
4097   idx        = a->j;
4098   ii         = a->i;
4099   while (i < m) {                /* For each row */
4100     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4101     /* Limits the number of elements in a node to 'a->inode.limit' */
4102     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4103       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4104       if (nzy != nzx) break;
4105       idy += nzx;              /* Same nonzero pattern */
4106       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4107       if (!flag) break;
4108     }
4109     ns[node_count++] = blk_size;
4110     idx             += blk_size*nzx;
4111     i                = j;
4112   }
4113   /* If not enough inodes found,, do not use inode version of the routines */
4114   if (!m || node_count > .8*m) {
4115     ierr = PetscFree(ns);CHKERRQ(ierr);
4116 
4117     a->inode.node_count       = 0;
4118     a->inode.size             = NULL;
4119     a->inode.use              = PETSC_FALSE;
4120     A->ops->mult              = MatMult_SeqAIJ;
4121     A->ops->sor               = MatSOR_SeqAIJ;
4122     A->ops->multadd           = MatMultAdd_SeqAIJ;
4123     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4124     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4125     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4126     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4127     A->ops->coloringpatch     = 0;
4128     A->ops->multdiagonalblock = 0;
4129 
4130     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4131   } else {
4132     if (!A->factortype) {
4133       A->ops->mult              = MatMult_SeqAIJ_Inode;
4134       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4135       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4136       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4137       if (A->rmap->n == A->cmap->n) {
4138         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4139         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4140         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4141         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4142         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4143       }
4144     } else {
4145       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4146     }
4147     a->inode.node_count = node_count;
4148     a->inode.size       = ns;
4149     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4150   }
4151   a->inode.checked          = PETSC_TRUE;
4152   a->inode.mat_nonzerostate = A->nonzerostate;
4153   PetscFunctionReturn(0);
4154 }
4155 
4156 #undef __FUNCT__
4157 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4158 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4159 {
4160   Mat            B =*C;
4161   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4162   PetscErrorCode ierr;
4163   PetscInt       m=A->rmap->n;
4164 
4165   PetscFunctionBegin;
4166   c->inode.use       = a->inode.use;
4167   c->inode.limit     = a->inode.limit;
4168   c->inode.max_limit = a->inode.max_limit;
4169   if (a->inode.size) {
4170     ierr                = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4171     c->inode.node_count = a->inode.node_count;
4172     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4173     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4174     if (!B->factortype) {
4175       B->ops->mult              = MatMult_SeqAIJ_Inode;
4176       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4177       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4178       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4179       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4180       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4181       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4182       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4183       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4184     } else {
4185       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4186     }
4187   } else {
4188     c->inode.size       = 0;
4189     c->inode.node_count = 0;
4190   }
4191   c->inode.ibdiagvalid = PETSC_FALSE;
4192   c->inode.ibdiag      = 0;
4193   c->inode.bdiag       = 0;
4194   PetscFunctionReturn(0);
4195 }
4196 
4197 #undef __FUNCT__
4198 #define __FUNCT__ "MatGetRow_FactoredLU"
4199 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4200 {
4201   PetscInt       k;
4202   const PetscInt *vi;
4203 
4204   PetscFunctionBegin;
4205   vi = aj + ai[row];
4206   for (k=0; k<nzl; k++) cols[k] = vi[k];
4207   vi        = aj + adiag[row];
4208   cols[nzl] = vi[0];
4209   vi        = aj + adiag[row+1]+1;
4210   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4211   PetscFunctionReturn(0);
4212 }
4213 /*
4214    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4215    Modified from MatSeqAIJCheckInode().
4216 
4217    Input Parameters:
4218 .  Mat A - ILU or LU matrix factor
4219 
4220 */
4221 #undef __FUNCT__
4222 #define __FUNCT__ "MatSeqAIJCheckInode_FactorLU"
4223 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4224 {
4225   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4226   PetscErrorCode ierr;
4227   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4228   PetscInt       *cols1,*cols2,*ns;
4229   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4230   PetscBool      flag;
4231 
4232   PetscFunctionBegin;
4233   if (!a->inode.use)    PetscFunctionReturn(0);
4234   if (a->inode.checked) PetscFunctionReturn(0);
4235 
4236   m = A->rmap->n;
4237   if (a->inode.size) ns = a->inode.size;
4238   else {
4239     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4240   }
4241 
4242   i          = 0;
4243   node_count = 0;
4244   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4245   while (i < m) {                /* For each row */
4246     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4247     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4248     nzx  = nzl1 + nzu1 + 1;
4249     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4250 
4251     /* Limits the number of elements in a node to 'a->inode.limit' */
4252     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4253       nzl2 = ai[j+1] - ai[j];
4254       nzu2 = adiag[j] - adiag[j+1] - 1;
4255       nzy  = nzl2 + nzu2 + 1;
4256       if (nzy != nzx) break;
4257       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4258       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4259       if (!flag) break;
4260     }
4261     ns[node_count++] = blk_size;
4262     i                = j;
4263   }
4264   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4265   /* If not enough inodes found,, do not use inode version of the routines */
4266   if (!m || node_count > .8*m) {
4267     ierr = PetscFree(ns);CHKERRQ(ierr);
4268 
4269     a->inode.node_count = 0;
4270     a->inode.size       = NULL;
4271     a->inode.use        = PETSC_FALSE;
4272 
4273     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4274   } else {
4275     A->ops->mult              = 0;
4276     A->ops->sor               = 0;
4277     A->ops->multadd           = 0;
4278     A->ops->getrowij          = 0;
4279     A->ops->restorerowij      = 0;
4280     A->ops->getcolumnij       = 0;
4281     A->ops->restorecolumnij   = 0;
4282     A->ops->coloringpatch     = 0;
4283     A->ops->multdiagonalblock = 0;
4284     a->inode.node_count       = node_count;
4285     a->inode.size             = ns;
4286 
4287     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4288   }
4289   a->inode.checked = PETSC_TRUE;
4290   PetscFunctionReturn(0);
4291 }
4292 
4293 #undef __FUNCT__
4294 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4295 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4296 {
4297   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4298 
4299   PetscFunctionBegin;
4300   a->inode.ibdiagvalid = PETSC_FALSE;
4301   PetscFunctionReturn(0);
4302 }
4303 
4304 /*
4305      This is really ugly. if inodes are used this replaces the
4306   permutations with ones that correspond to rows/cols of the matrix
4307   rather then inode blocks
4308 */
4309 #undef __FUNCT__
4310 #define __FUNCT__ "MatInodeAdjustForInodes"
4311 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4312 {
4313   PetscErrorCode ierr;
4314 
4315   PetscFunctionBegin;
4316   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4317   PetscFunctionReturn(0);
4318 }
4319 
4320 #undef __FUNCT__
4321 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4322 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4323 {
4324   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4325   PetscErrorCode ierr;
4326   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4327   const PetscInt *ridx,*cidx;
4328   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4329   PetscInt       nslim_col,*ns_col;
4330   IS             ris = *rperm,cis = *cperm;
4331 
4332   PetscFunctionBegin;
4333   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4334   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4335 
4336   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4337   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4338   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4339 
4340   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4341   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4342 
4343   /* Form the inode structure for the rows of permuted matric using inv perm*/
4344   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4345 
4346   /* Construct the permutations for rows*/
4347   for (i=0,row = 0; i<nslim_row; ++i) {
4348     indx      = ridx[i];
4349     start_val = tns[indx];
4350     end_val   = tns[indx + 1];
4351     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4352   }
4353 
4354   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4355   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4356 
4357   /* Construct permutations for columns */
4358   for (i=0,col=0; i<nslim_col; ++i) {
4359     indx      = cidx[i];
4360     start_val = tns[indx];
4361     end_val   = tns[indx + 1];
4362     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4363   }
4364 
4365   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4366   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4367   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4368   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4369 
4370   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4371   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4372 
4373   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4374   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4375   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4376   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4377   ierr = PetscFree(tns);CHKERRQ(ierr);
4378   PetscFunctionReturn(0);
4379 }
4380 
4381 #undef __FUNCT__
4382 #define __FUNCT__ "MatInodeGetInodeSizes"
4383 /*@C
4384    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4385 
4386    Not Collective
4387 
4388    Input Parameter:
4389 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4390 
4391    Output Parameter:
4392 +  node_count - no of inodes present in the matrix.
4393 .  sizes      - an array of size node_count,with sizes of each inode.
4394 -  limit      - the max size used to generate the inodes.
4395 
4396    Level: advanced
4397 
4398    Notes: This routine returns some internal storage information
4399    of the matrix, it is intended to be used by advanced users.
4400    It should be called after the matrix is assembled.
4401    The contents of the sizes[] array should not be changed.
4402    NULL may be passed for information not requested.
4403 
4404 .keywords: matrix, seqaij, get, inode
4405 
4406 .seealso: MatGetInfo()
4407 @*/
4408 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4409 {
4410   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4411 
4412   PetscFunctionBegin;
4413   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4414   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4415   if (f) {
4416     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4417   }
4418   PetscFunctionReturn(0);
4419 }
4420 
4421 #undef __FUNCT__
4422 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4423 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4424 {
4425   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4426 
4427   PetscFunctionBegin;
4428   if (node_count) *node_count = a->inode.node_count;
4429   if (sizes)      *sizes      = a->inode.size;
4430   if (limit)      *limit      = a->inode.limit;
4431   PetscFunctionReturn(0);
4432 }
4433