xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 0e7a5c2b80ddf573adbcd5268e4f427f138ff352)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat              C=B;
1186   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187   IS               isrow = b->row,isicol = b->icol;
1188   PetscErrorCode   ierr;
1189   const PetscInt   *r,*ic,*ics;
1190   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191   PetscInt         i,j,k,nz,nzL,row,*pj;
1192   const PetscInt   *ajtmp,*bjtmp;
1193   MatScalar        *pc,*pc1,*pc2,*pc3,mul1,mul2,mul3,*pv,*rtmp1,*rtmp2,*rtmp3;
1194   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3;
1195   FactorShiftCtx   sctx;
1196   PetscInt         *ddiag;
1197   PetscReal        rs;
1198   MatScalar        d;
1199   PetscInt         inod,nodesz,node_max,*ns,col;
1200   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1201 
1202   PetscFunctionBegin;
1203   /* MatPivotSetUp(): initialize shift context sctx */
1204   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1205 
1206   if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207     ddiag          = a->diag;
1208     sctx.shift_top = info->zeropivot;
1209     for (i=0; i<n; i++) {
1210       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211       d  = (aa)[ddiag[i]];
1212       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213       v  = aa+ai[i];
1214       nz = ai[i+1] - ai[i];
1215       for (j=0; j<nz; j++)
1216 	rs += PetscAbsScalar(v[j]);
1217       if (rs>sctx.shift_top) sctx.shift_top = rs;
1218     }
1219     sctx.shift_top   *= 1.1;
1220     sctx.nshift_max   = 5;
1221     sctx.shift_lo     = 0.;
1222     sctx.shift_hi     = 1.;
1223   }
1224 
1225   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1226   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1227 
1228   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1229   ierr  = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1230   rtmp2 = rtmp1 + n;
1231   rtmp3 = rtmp2 + n;
1232   ics   = ic;
1233 
1234   node_max = a->inode.node_count;
1235   ns       = a->inode.size;
1236   if (!ns){
1237     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1238   }
1239 
1240   /* If max inode size > 3, split it into two inodes.*/
1241   /* also map the inode sizes according to the ordering */
1242   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1243   for (i=0,j=0; i<node_max; ++i,++j){
1244     if (ns[i]>3) {
1245       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1246       ++j;
1247       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1248     } else {
1249       tmp_vec1[j] = ns[i];
1250     }
1251   }
1252   /* Use the correct node_max */
1253   node_max = j;
1254 
1255   /* Now reorder the inode info based on mat re-ordering info */
1256   /* First create a row -> inode_size_array_index map */
1257   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1258   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1259   for (i=0,row=0; i<node_max; i++) {
1260     nodesz = tmp_vec1[i];
1261     for (j=0; j<nodesz; j++,row++) {
1262       nsmap[row] = i;
1263     }
1264   }
1265   /* Using nsmap, create a reordered ns structure */
1266   for (i=0,j=0; i< node_max; i++) {
1267     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1268     tmp_vec2[i]  = nodesz;
1269     j           += nodesz;
1270   }
1271   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1272   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1273 
1274   /* Now use the correct ns */
1275   ns = tmp_vec2;
1276 
1277   do {
1278     sctx.useshift = PETSC_FALSE;
1279     /* Now loop over each block-row, and do the factorization */
1280     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1281       nodesz = ns[inod];
1282 
1283       switch (nodesz){
1284       case 1:
1285       /*----------*/
1286         /* zero rtmp1 */
1287         /* L part */
1288         nz    = bi[i+1] - bi[i];
1289         bjtmp = bj + bi[i];
1290         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1291 
1292         /* U part */
1293         nz = bdiag[i]-bdiag[i+1];
1294         bjtmp = bj + bdiag[i+1]+1;
1295         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1296 
1297         /* load in initial (unfactored row) */
1298         nz    = ai[r[i]+1] - ai[r[i]];
1299         ajtmp = aj + ai[r[i]];
1300         v     = aa + ai[r[i]];
1301         for (j=0; j<nz; j++) {
1302           rtmp1[ics[ajtmp[j]]] = v[j];
1303         }
1304         /* ZeropivotApply() */
1305         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1306 
1307         /* elimination */
1308         bjtmp = bj + bi[i];
1309         row   = *bjtmp++;
1310         nzL   = bi[i+1] - bi[i];
1311         for(k=0; k < nzL;k++) {
1312           pc = rtmp1 + row;
1313           if (*pc != 0.0) {
1314             pv   = b->a + bdiag[row];
1315             mul1 = *pc * (*pv);
1316             *pc  = mul1;
1317             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1318             pv = b->a + bdiag[row+1]+1;
1319             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1320             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1321             ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1322           }
1323           row = *bjtmp++;
1324         }
1325 
1326         /* finished row so stick it into b->a */
1327         rs = 0.0;
1328         /* L part */
1329         pv = b->a + bi[i] ;
1330         pj = b->j + bi[i] ;
1331         nz = bi[i+1] - bi[i];
1332         for (j=0; j<nz; j++) {
1333           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1334         }
1335 
1336         /* U part */
1337         pv = b->a + bdiag[i+1]+1;
1338         pj = b->j + bdiag[i+1]+1;
1339         nz = bdiag[i] - bdiag[i+1]-1;
1340         for (j=0; j<nz; j++) {
1341           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1342         }
1343 
1344         /* Check zero pivot */
1345         sctx.rs = rs;
1346         sctx.pv = rtmp1[i];
1347         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1348 
1349         /* Mark diagonal and invert diagonal for simplier triangular solves */
1350         pv  = b->a + bdiag[i];
1351         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1352         break;
1353 
1354       case 2:
1355       /*----------*/
1356         /* zero rtmp1 and rtmp2 */
1357         /* L part */
1358         nz    = bi[i+1] - bi[i];
1359         bjtmp = bj + bi[i];
1360         for  (j=0; j<nz; j++) {
1361           col = bjtmp[j];
1362           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1363         }
1364 
1365         /* U part */
1366         nz = bdiag[i]-bdiag[i+1];
1367         bjtmp = bj + bdiag[i+1]+1;
1368         for  (j=0; j<nz; j++) {
1369           col = bjtmp[j];
1370           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1371         }
1372 
1373         /* load in initial (unfactored row) */
1374         nz    = ai[r[i]+1] - ai[r[i]];
1375         ajtmp = aj + ai[r[i]];
1376         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1377         for (j=0; j<nz; j++) {
1378           col = ics[ajtmp[j]];
1379           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1380         }
1381         /* ZeropivotApply(): shift the diagonal of the matrix  */
1382         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1383 
1384         /* elimination */
1385         bjtmp = bj + bi[i];
1386         row   = *bjtmp++; /* pivot row */
1387         nzL   = bi[i+1] - bi[i];
1388         for(k=0; k < nzL;k++) {
1389           pc1 = rtmp1 + row;
1390           pc2 = rtmp2 + row;
1391           if (*pc1 != 0.0 || *pc2 != 0.0) {
1392             pv   = b->a + bdiag[row];
1393             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1394             *pc1 = mul1;       *pc2 = mul2;
1395 
1396             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1397             pv = b->a + bdiag[row+1]+1;
1398             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1399             for (j=0; j<nz; j++){
1400               col = pj[j];
1401               rtmp1[col] -= mul1 * pv[j];
1402               rtmp2[col] -= mul2 * pv[j];
1403             }
1404             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1405           }
1406           row = *bjtmp++;
1407         }
1408 
1409         /* finished row i; check zero pivot, then stick row i into b->a */
1410         rs  = 0.0;
1411         /* L part */
1412         pc1 = b->a + bi[i];
1413         pj  = b->j + bi[i] ;
1414         nz  = bi[i+1] - bi[i];
1415         for (j=0; j<nz; j++) {
1416           col = pj[j];
1417           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1418         }
1419         /* U part */
1420         pc1 = b->a + bdiag[i+1]+1;
1421         pj  = b->j + bdiag[i+1]+1;
1422         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1423         for (j=0; j<nz; j++) {
1424           col = pj[j];
1425           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1426         }
1427 
1428         sctx.rs  = rs;
1429         sctx.pv  = rtmp1[i];
1430         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1431         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1432         *pc1 = 1.0/sctx.pv;
1433 
1434         /* Now take care of diagonal 2x2 block. */
1435         pc2 = rtmp2 + i;
1436         if (*pc2 != 0.0){
1437           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1438           *pc2 = mul1;          /* insert L entry */
1439           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1440           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1441           for (j=0; j<nz; j++) {
1442             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1443           }
1444           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1445         }
1446 
1447         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1448         rs = 0.0;
1449         /* L part */
1450         pc2 = b->a + bi[i+1];
1451         pj  = b->j + bi[i+1] ;
1452         nz  = bi[i+2] - bi[i+1];
1453         for (j=0; j<nz; j++) {
1454           col = pj[j];
1455           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1456         }
1457         /* U part */
1458         pc2 = b->a + bdiag[i+2]+1;
1459         pj  = b->j + bdiag[i+2]+1;
1460         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1461         for (j=0; j<nz; j++) {
1462           col = pj[j];
1463           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1464         }
1465 
1466         sctx.rs  = rs;
1467         sctx.pv  = rtmp2[i+1];
1468         ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr);
1469         pc2  = b->a + bdiag[i+1];
1470         *pc2 = 1.0/sctx.pv;
1471         break;
1472 
1473       case 3:
1474       /*----------*/
1475         /* zero rtmp */
1476         /* L part */
1477         nz    = bi[i+1] - bi[i];
1478         bjtmp = bj + bi[i];
1479         for  (j=0; j<nz; j++) {
1480           col = bjtmp[j];
1481           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1482         }
1483 
1484         /* U part */
1485         nz = bdiag[i]-bdiag[i+1];
1486         bjtmp = bj + bdiag[i+1]+1;
1487         for  (j=0; j<nz; j++) {
1488           col = bjtmp[j];
1489           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1490         }
1491 
1492         /* load in initial (unfactored row) */
1493         nz    = ai[r[i]+1] - ai[r[i]];
1494         ajtmp = aj + ai[r[i]];
1495         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1496         for (j=0; j<nz; j++) {
1497           col = ics[ajtmp[j]];
1498           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1499         }
1500         /* ZeropivotApply(): shift the diagonal of the matrix  */
1501         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1502 
1503         /* elimination */
1504         bjtmp = bj + bi[i];
1505         row   = *bjtmp++; /* pivot row */
1506         nzL   = bi[i+1] - bi[i];
1507         for(k=0; k < nzL;k++) {
1508           pc1 = rtmp1 + row;
1509           pc2 = rtmp2 + row;
1510           pc3 = rtmp3 + row;
1511           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1512             pv  = b->a + bdiag[row];
1513             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1514             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1515 
1516             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1517             pv = b->a + bdiag[row+1]+1;
1518             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1519             for (j=0; j<nz; j++){
1520               col = pj[j];
1521               rtmp1[col] -= mul1 * pv[j];
1522               rtmp2[col] -= mul2 * pv[j];
1523               rtmp3[col] -= mul3 * pv[j];
1524             }
1525             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1526           }
1527           row = *bjtmp++;
1528         }
1529 
1530         /* finished row i; check zero pivot, then stick row i into b->a */
1531         rs  = 0.0;
1532         /* L part */
1533         pc1 = b->a + bi[i];
1534         pj  = b->j + bi[i] ;
1535         nz  = bi[i+1] - bi[i];
1536         for (j=0; j<nz; j++) {
1537           col = pj[j];
1538           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1539         }
1540         /* U part */
1541         pc1 = b->a + bdiag[i+1]+1;
1542         pj  = b->j + bdiag[i+1]+1;
1543         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1544         for (j=0; j<nz; j++) {
1545           col = pj[j];
1546           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1547         }
1548 
1549         sctx.rs  = rs;
1550         sctx.pv  = rtmp1[i];
1551         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1552         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1553         *pc1 = 1.0/sctx.pv;
1554 
1555         /* Now take care of 1st column of diagonal 3x3 block. */
1556         pc2 = rtmp2 + i;
1557         pc3 = rtmp3 + i;
1558         if (*pc2 != 0.0 || *pc3 != 0.0){
1559           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1560           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1561           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1562           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1563           for (j=0; j<nz; j++) {
1564             col = pj[j];
1565             rtmp2[col] -= mul2 * rtmp1[col];
1566             rtmp3[col] -= mul3 * rtmp1[col];
1567           }
1568           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1569         }
1570 
1571         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1572         rs = 0.0;
1573         /* L part */
1574         pc2 = b->a + bi[i+1];
1575         pj  = b->j + bi[i+1] ;
1576         nz  = bi[i+2] - bi[i+1];
1577         for (j=0; j<nz; j++) {
1578           col = pj[j];
1579           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1580         }
1581         /* U part */
1582         pc2 = b->a + bdiag[i+2]+1;
1583         pj  = b->j + bdiag[i+2]+1;
1584         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1585         for (j=0; j<nz; j++) {
1586           col = pj[j];
1587           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1588         }
1589 
1590         sctx.rs  = rs;
1591         sctx.pv  = rtmp2[i+1];
1592         ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr);
1593         pc2  = b->a + bdiag[i+1];
1594         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1595 
1596         /* Now take care of 2nd column of diagonal 3x3 block. */
1597         pc3 = rtmp3 + i+1;
1598         if (*pc3 != 0.0){
1599           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1600           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1601           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1602           for (j=0; j<nz; j++) {
1603             col = pj[j];
1604             rtmp3[col] -= mul3 * rtmp2[col];
1605           }
1606           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1607         }
1608 
1609         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1610         rs = 0.0;
1611         /* L part */
1612         pc3 = b->a + bi[i+2];
1613         pj  = b->j + bi[i+2] ;
1614         nz  = bi[i+3] - bi[i+2];
1615         for (j=0; j<nz; j++) {
1616           col = pj[j];
1617           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1618         }
1619         /* U part */
1620         pc3 = b->a + bdiag[i+3]+1;
1621         pj  = b->j + bdiag[i+3]+1;
1622         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1623         for (j=0; j<nz; j++) {
1624           col = pj[j];
1625           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1626         }
1627 
1628         sctx.rs  = rs;
1629         sctx.pv  = rtmp3[i+2];
1630         ierr = MatPivotCheck(info,sctx,i+2);CHKERRQ(ierr);
1631         pc3  = b->a + bdiag[i+2];
1632         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1633         break;
1634 
1635       default:
1636         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
1637       }
1638       i += nodesz;                 /* Update the row */
1639     }
1640 
1641     /* MatPivotRefine() */
1642     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1643       /*
1644        * if no shift in this attempt & shifting & started shifting & can refine,
1645        * then try lower shift
1646        */
1647       sctx.shift_hi       = sctx.shift_fraction;
1648       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1649       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1650       sctx.useshift       = PETSC_TRUE;
1651       sctx.nshift++;
1652     }
1653   } while (sctx.useshift);
1654 
1655   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1656   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1657   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1658   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1659 
1660   C->ops->solve              = MatSolve_SeqAIJ_Inode;
1661   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1662   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1663   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1664   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1665   C->assembled    = PETSC_TRUE;
1666   C->preallocated = PETSC_TRUE;
1667   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1668 
1669   /* MatShiftView(A,info,&sctx) */
1670   if (sctx.nshift){
1671     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) {
1672       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1673     } else if (info->shifttype == MAT_SHIFT_NONZERO) {
1674       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1675     } else if (info->shifttype == MAT_SHIFT_INBLOCKS){
1676       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1677     }
1678   }
1679   PetscFunctionReturn(0);
1680 }
1681 
1682 #undef __FUNCT__
1683 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1684 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1685 {
1686   Mat               C = B;
1687   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1688   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1689   PetscErrorCode    ierr;
1690   const PetscInt    *r,*ic,*c,*ics;
1691   PetscInt          n = A->rmap->n,*bi = b->i;
1692   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1693   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1694   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1695   PetscScalar       mul1,mul2,mul3,tmp;
1696   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1697   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1698   PetscReal         rs=0.0;
1699   FactorShiftCtx    sctx;
1700   PetscInt          newshift;
1701 
1702   PetscFunctionBegin;
1703   sctx.shift_top      = 0;
1704   sctx.nshift_max     = 0;
1705   sctx.shift_lo       = 0;
1706   sctx.shift_hi       = 0;
1707   sctx.shift_fraction = 0;
1708 
1709   /* if both shift schemes are chosen by user, only use info->shiftpd */
1710   if (info->shifttype==MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1711     sctx.shift_top = 0;
1712     for (i=0; i<n; i++) {
1713       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1714       rs    = 0.0;
1715       ajtmp = aj + ai[i];
1716       rtmp1 = aa + ai[i];
1717       nz = ai[i+1] - ai[i];
1718       for (j=0; j<nz; j++){
1719         if (*ajtmp != i){
1720           rs += PetscAbsScalar(*rtmp1++);
1721         } else {
1722           rs -= PetscRealPart(*rtmp1++);
1723         }
1724         ajtmp++;
1725       }
1726       if (rs>sctx.shift_top) sctx.shift_top = rs;
1727     }
1728     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1729     sctx.shift_top *= 1.1;
1730     sctx.nshift_max = 5;
1731     sctx.shift_lo   = 0.;
1732     sctx.shift_hi   = 1.;
1733   }
1734   sctx.shift_amount = 0;
1735   sctx.nshift       = 0;
1736 
1737   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1738   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1739   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1740   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1741   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1742   ics   = ic ;
1743   rtmp22 = rtmp11 + n;
1744   rtmp33 = rtmp22 + n;
1745 
1746   node_max = a->inode.node_count;
1747   ns       = a->inode.size;
1748   if (!ns){
1749     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1750   }
1751 
1752   /* If max inode size > 3, split it into two inodes.*/
1753   /* also map the inode sizes according to the ordering */
1754   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1755   for (i=0,j=0; i<node_max; ++i,++j){
1756     if (ns[i]>3) {
1757       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1758       ++j;
1759       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1760     } else {
1761       tmp_vec1[j] = ns[i];
1762     }
1763   }
1764   /* Use the correct node_max */
1765   node_max = j;
1766 
1767   /* Now reorder the inode info based on mat re-ordering info */
1768   /* First create a row -> inode_size_array_index map */
1769   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1770   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1771   for (i=0,row=0; i<node_max; i++) {
1772     nodesz = tmp_vec1[i];
1773     for (j=0; j<nodesz; j++,row++) {
1774       nsmap[row] = i;
1775     }
1776   }
1777   /* Using nsmap, create a reordered ns structure */
1778   for (i=0,j=0; i< node_max; i++) {
1779     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1780     tmp_vec2[i]  = nodesz;
1781     j           += nodesz;
1782   }
1783   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1784   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1785   /* Now use the correct ns */
1786   ns = tmp_vec2;
1787 
1788   do {
1789     sctx.useshift = PETSC_FALSE;
1790     /* Now loop over each block-row, and do the factorization */
1791     for (i=0,row=0; i<node_max; i++) {
1792       nodesz = ns[i];
1793       nz     = bi[row+1] - bi[row];
1794       bjtmp  = bj + bi[row];
1795 
1796       switch (nodesz){
1797       case 1:
1798         for  (j=0; j<nz; j++){
1799           idx        = bjtmp[j];
1800           rtmp11[idx] = 0.0;
1801         }
1802 
1803         /* load in initial (unfactored row) */
1804         idx    = r[row];
1805         nz_tmp = ai[idx+1] - ai[idx];
1806         ajtmp  = aj + ai[idx];
1807         v1     = aa + ai[idx];
1808 
1809         for (j=0; j<nz_tmp; j++) {
1810           idx        = ics[ajtmp[j]];
1811           rtmp11[idx] = v1[j];
1812         }
1813         rtmp11[ics[r[row]]] += sctx.shift_amount;
1814 
1815         prow = *bjtmp++ ;
1816         while (prow < row) {
1817           pc1 = rtmp11 + prow;
1818           if (*pc1 != 0.0){
1819             pv   = ba + bd[prow];
1820             pj   = nbj + bd[prow];
1821             mul1 = *pc1 * *pv++;
1822             *pc1 = mul1;
1823             nz_tmp = bi[prow+1] - bd[prow] - 1;
1824             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1825             for (j=0; j<nz_tmp; j++) {
1826               tmp = pv[j];
1827               idx = pj[j];
1828               rtmp11[idx] -= mul1 * tmp;
1829             }
1830           }
1831           prow = *bjtmp++ ;
1832         }
1833         pj  = bj + bi[row];
1834         pc1 = ba + bi[row];
1835 
1836         sctx.pv    = rtmp11[row];
1837         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1838         rs         = 0.0;
1839         for (j=0; j<nz; j++) {
1840           idx    = pj[j];
1841           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1842           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1843         }
1844         sctx.rs  = rs;
1845         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1846         if (newshift == 1) goto endofwhile;
1847         break;
1848 
1849       case 2:
1850         for (j=0; j<nz; j++) {
1851           idx        = bjtmp[j];
1852           rtmp11[idx] = 0.0;
1853           rtmp22[idx] = 0.0;
1854         }
1855 
1856         /* load in initial (unfactored row) */
1857         idx    = r[row];
1858         nz_tmp = ai[idx+1] - ai[idx];
1859         ajtmp  = aj + ai[idx];
1860         v1     = aa + ai[idx];
1861         v2     = aa + ai[idx+1];
1862         for (j=0; j<nz_tmp; j++) {
1863           idx        = ics[ajtmp[j]];
1864           rtmp11[idx] = v1[j];
1865           rtmp22[idx] = v2[j];
1866         }
1867         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1868         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1869 
1870         prow = *bjtmp++ ;
1871         while (prow < row) {
1872           pc1 = rtmp11 + prow;
1873           pc2 = rtmp22 + prow;
1874           if (*pc1 != 0.0 || *pc2 != 0.0){
1875             pv   = ba + bd[prow];
1876             pj   = nbj + bd[prow];
1877             mul1 = *pc1 * *pv;
1878             mul2 = *pc2 * *pv;
1879             ++pv;
1880             *pc1 = mul1;
1881             *pc2 = mul2;
1882 
1883             nz_tmp = bi[prow+1] - bd[prow] - 1;
1884             for (j=0; j<nz_tmp; j++) {
1885               tmp = pv[j];
1886               idx = pj[j];
1887               rtmp11[idx] -= mul1 * tmp;
1888               rtmp22[idx] -= mul2 * tmp;
1889             }
1890             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1891           }
1892           prow = *bjtmp++ ;
1893         }
1894 
1895         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1896         pc1 = rtmp11 + prow;
1897         pc2 = rtmp22 + prow;
1898 
1899         sctx.pv = *pc1;
1900         pj      = bj + bi[prow];
1901         rs      = 0.0;
1902         for (j=0; j<nz; j++){
1903           idx = pj[j];
1904           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1905         }
1906         sctx.rs = rs;
1907         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1908         if (newshift == 1) goto endofwhile;
1909 
1910         if (*pc2 != 0.0){
1911           pj     = nbj + bd[prow];
1912           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1913           *pc2   = mul2;
1914           nz_tmp = bi[prow+1] - bd[prow] - 1;
1915           for (j=0; j<nz_tmp; j++) {
1916             idx = pj[j] ;
1917             tmp = rtmp11[idx];
1918             rtmp22[idx] -= mul2 * tmp;
1919           }
1920           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1921         }
1922 
1923         pj  = bj + bi[row];
1924         pc1 = ba + bi[row];
1925         pc2 = ba + bi[row+1];
1926 
1927         sctx.pv = rtmp22[row+1];
1928         rs = 0.0;
1929         rtmp11[row]   = 1.0/rtmp11[row];
1930         rtmp22[row+1] = 1.0/rtmp22[row+1];
1931         /* copy row entries from dense representation to sparse */
1932         for (j=0; j<nz; j++) {
1933           idx    = pj[j];
1934           pc1[j] = rtmp11[idx];
1935           pc2[j] = rtmp22[idx];
1936           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1937         }
1938         sctx.rs = rs;
1939         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1940         if (newshift == 1) goto endofwhile;
1941         break;
1942 
1943       case 3:
1944         for  (j=0; j<nz; j++) {
1945           idx        = bjtmp[j];
1946           rtmp11[idx] = 0.0;
1947           rtmp22[idx] = 0.0;
1948           rtmp33[idx] = 0.0;
1949         }
1950         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1951         idx    = r[row];
1952         nz_tmp = ai[idx+1] - ai[idx];
1953         ajtmp = aj + ai[idx];
1954         v1    = aa + ai[idx];
1955         v2    = aa + ai[idx+1];
1956         v3    = aa + ai[idx+2];
1957         for (j=0; j<nz_tmp; j++) {
1958           idx        = ics[ajtmp[j]];
1959           rtmp11[idx] = v1[j];
1960           rtmp22[idx] = v2[j];
1961           rtmp33[idx] = v3[j];
1962         }
1963         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1964         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1965         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1966 
1967         /* loop over all pivot row blocks above this row block */
1968         prow = *bjtmp++ ;
1969         while (prow < row) {
1970           pc1 = rtmp11 + prow;
1971           pc2 = rtmp22 + prow;
1972           pc3 = rtmp33 + prow;
1973           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1974             pv   = ba  + bd[prow];
1975             pj   = nbj + bd[prow];
1976             mul1 = *pc1 * *pv;
1977             mul2 = *pc2 * *pv;
1978             mul3 = *pc3 * *pv;
1979             ++pv;
1980             *pc1 = mul1;
1981             *pc2 = mul2;
1982             *pc3 = mul3;
1983 
1984             nz_tmp = bi[prow+1] - bd[prow] - 1;
1985             /* update this row based on pivot row */
1986             for (j=0; j<nz_tmp; j++) {
1987               tmp = pv[j];
1988               idx = pj[j];
1989               rtmp11[idx] -= mul1 * tmp;
1990               rtmp22[idx] -= mul2 * tmp;
1991               rtmp33[idx] -= mul3 * tmp;
1992             }
1993             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1994           }
1995           prow = *bjtmp++ ;
1996         }
1997 
1998         /* Now take care of diagonal 3x3 block in this set of rows */
1999         /* note: prow = row here */
2000         pc1 = rtmp11 + prow;
2001         pc2 = rtmp22 + prow;
2002         pc3 = rtmp33 + prow;
2003 
2004         sctx.pv = *pc1;
2005         pj      = bj + bi[prow];
2006         rs      = 0.0;
2007         for (j=0; j<nz; j++){
2008           idx = pj[j];
2009           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2010         }
2011         sctx.rs = rs;
2012         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2013         if (newshift == 1) goto endofwhile;
2014 
2015         if (*pc2 != 0.0 || *pc3 != 0.0){
2016           mul2 = (*pc2)/(*pc1);
2017           mul3 = (*pc3)/(*pc1);
2018           *pc2 = mul2;
2019           *pc3 = mul3;
2020           nz_tmp = bi[prow+1] - bd[prow] - 1;
2021           pj     = nbj + bd[prow];
2022           for (j=0; j<nz_tmp; j++) {
2023             idx = pj[j] ;
2024             tmp = rtmp11[idx];
2025             rtmp22[idx] -= mul2 * tmp;
2026             rtmp33[idx] -= mul3 * tmp;
2027           }
2028           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2029         }
2030         ++prow;
2031 
2032         pc2 = rtmp22 + prow;
2033         pc3 = rtmp33 + prow;
2034         sctx.pv = *pc2;
2035         pj      = bj + bi[prow];
2036         rs      = 0.0;
2037         for (j=0; j<nz; j++){
2038           idx = pj[j];
2039           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2040         }
2041         sctx.rs = rs;
2042         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
2043         if (newshift == 1) goto endofwhile;
2044 
2045         if (*pc3 != 0.0){
2046           mul3   = (*pc3)/(*pc2);
2047           *pc3   = mul3;
2048           pj     = nbj + bd[prow];
2049           nz_tmp = bi[prow+1] - bd[prow] - 1;
2050           for (j=0; j<nz_tmp; j++) {
2051             idx = pj[j] ;
2052             tmp = rtmp22[idx];
2053             rtmp33[idx] -= mul3 * tmp;
2054           }
2055           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2056         }
2057 
2058         pj  = bj + bi[row];
2059         pc1 = ba + bi[row];
2060         pc2 = ba + bi[row+1];
2061         pc3 = ba + bi[row+2];
2062 
2063         sctx.pv = rtmp33[row+2];
2064         rs = 0.0;
2065         rtmp11[row]   = 1.0/rtmp11[row];
2066         rtmp22[row+1] = 1.0/rtmp22[row+1];
2067         rtmp33[row+2] = 1.0/rtmp33[row+2];
2068         /* copy row entries from dense representation to sparse */
2069         for (j=0; j<nz; j++) {
2070           idx    = pj[j];
2071           pc1[j] = rtmp11[idx];
2072           pc2[j] = rtmp22[idx];
2073           pc3[j] = rtmp33[idx];
2074           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2075         }
2076 
2077         sctx.rs = rs;
2078         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
2079         if (newshift == 1) goto endofwhile;
2080         break;
2081 
2082       default:
2083         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
2084       }
2085       row += nodesz;                 /* Update the row */
2086     }
2087     endofwhile:;
2088   } while (sctx.useshift);
2089   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2090   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2091   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2092   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2093   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2094   (B)->ops->solve           = MatSolve_SeqAIJ_Inode_inplace;
2095   /* do not set solve add, since MatSolve_Inode + Add is faster */
2096   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2097   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2098   C->assembled   = PETSC_TRUE;
2099   C->preallocated = PETSC_TRUE;
2100   if (sctx.nshift) {
2101     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) {
2102       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2103     } else if (info->shifttype == MAT_SHIFT_NONZERO) {
2104       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2105     }
2106   }
2107   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2108   PetscFunctionReturn(0);
2109 }
2110 
2111 
2112 /* ----------------------------------------------------------- */
2113 #undef __FUNCT__
2114 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2115 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2116 {
2117   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2118   IS                iscol = a->col,isrow = a->row;
2119   PetscErrorCode    ierr;
2120   const PetscInt    *r,*c,*rout,*cout;
2121   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2122   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2123   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2124   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2125   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2126   const PetscScalar *b;
2127 
2128   PetscFunctionBegin;
2129   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
2130   node_max = a->inode.node_count;
2131   ns       = a->inode.size;     /* Node Size array */
2132 
2133   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2134   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2135   tmp  = a->solve_work;
2136 
2137   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2138   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2139 
2140   /* forward solve the lower triangular */
2141   tmps = tmp ;
2142   aa   = a_a ;
2143   aj   = a_j ;
2144   ad   = a->diag;
2145 
2146   for (i = 0,row = 0; i< node_max; ++i){
2147     nsz = ns[i];
2148     aii = ai[row];
2149     v1  = aa + aii;
2150     vi  = aj + aii;
2151     nz  = ai[row+1]- ai[row];
2152 
2153     if (i < node_max-1) {
2154       /* Prefetch the indices for the next block */
2155       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2156       /* Prefetch the data for the next block */
2157       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2158     }
2159 
2160     switch (nsz){               /* Each loop in 'case' is unrolled */
2161     case 1 :
2162       sum1 = b[r[row]];
2163       for(j=0; j<nz-1; j+=2){
2164         i0   = vi[j];
2165         i1   = vi[j+1];
2166         tmp0 = tmps[i0];
2167         tmp1 = tmps[i1];
2168         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2169       }
2170       if(j == nz-1){
2171         tmp0 = tmps[vi[j]];
2172         sum1 -= v1[j]*tmp0;
2173       }
2174       tmp[row++]=sum1;
2175       break;
2176     case 2:
2177       sum1 = b[r[row]];
2178       sum2 = b[r[row+1]];
2179       v2   = aa + ai[row+1];
2180 
2181       for(j=0; j<nz-1; j+=2){
2182         i0   = vi[j];
2183         i1   = vi[j+1];
2184         tmp0 = tmps[i0];
2185         tmp1 = tmps[i1];
2186         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2187         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2188       }
2189       if(j == nz-1){
2190         tmp0 = tmps[vi[j]];
2191         sum1 -= v1[j] *tmp0;
2192         sum2 -= v2[j] *tmp0;
2193       }
2194       sum2 -= v2[nz] * sum1;
2195       tmp[row ++]=sum1;
2196       tmp[row ++]=sum2;
2197       break;
2198     case 3:
2199       sum1 = b[r[row]];
2200       sum2 = b[r[row+1]];
2201       sum3 = b[r[row+2]];
2202       v2   = aa + ai[row+1];
2203       v3   = aa + ai[row+2];
2204 
2205       for (j=0; j<nz-1; j+=2){
2206         i0   = vi[j];
2207         i1   = vi[j+1];
2208         tmp0 = tmps[i0];
2209         tmp1 = tmps[i1];
2210         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2211         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2212         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2213       }
2214       if (j == nz-1){
2215         tmp0 = tmps[vi[j]];
2216         sum1 -= v1[j] *tmp0;
2217         sum2 -= v2[j] *tmp0;
2218         sum3 -= v3[j] *tmp0;
2219       }
2220       sum2 -= v2[nz] * sum1;
2221       sum3 -= v3[nz] * sum1;
2222       sum3 -= v3[nz+1] * sum2;
2223       tmp[row ++]=sum1;
2224       tmp[row ++]=sum2;
2225       tmp[row ++]=sum3;
2226       break;
2227 
2228     case 4:
2229       sum1 = b[r[row]];
2230       sum2 = b[r[row+1]];
2231       sum3 = b[r[row+2]];
2232       sum4 = b[r[row+3]];
2233       v2   = aa + ai[row+1];
2234       v3   = aa + ai[row+2];
2235       v4   = aa + ai[row+3];
2236 
2237       for (j=0; j<nz-1; j+=2){
2238         i0   = vi[j];
2239         i1   = vi[j+1];
2240         tmp0 = tmps[i0];
2241         tmp1 = tmps[i1];
2242         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2243         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2244         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2245         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2246       }
2247       if (j == nz-1){
2248         tmp0 = tmps[vi[j]];
2249         sum1 -= v1[j] *tmp0;
2250         sum2 -= v2[j] *tmp0;
2251         sum3 -= v3[j] *tmp0;
2252         sum4 -= v4[j] *tmp0;
2253       }
2254       sum2 -= v2[nz] * sum1;
2255       sum3 -= v3[nz] * sum1;
2256       sum4 -= v4[nz] * sum1;
2257       sum3 -= v3[nz+1] * sum2;
2258       sum4 -= v4[nz+1] * sum2;
2259       sum4 -= v4[nz+2] * sum3;
2260 
2261       tmp[row ++]=sum1;
2262       tmp[row ++]=sum2;
2263       tmp[row ++]=sum3;
2264       tmp[row ++]=sum4;
2265       break;
2266     case 5:
2267       sum1 = b[r[row]];
2268       sum2 = b[r[row+1]];
2269       sum3 = b[r[row+2]];
2270       sum4 = b[r[row+3]];
2271       sum5 = b[r[row+4]];
2272       v2   = aa + ai[row+1];
2273       v3   = aa + ai[row+2];
2274       v4   = aa + ai[row+3];
2275       v5   = aa + ai[row+4];
2276 
2277       for (j=0; j<nz-1; j+=2){
2278         i0   = vi[j];
2279         i1   = vi[j+1];
2280         tmp0 = tmps[i0];
2281         tmp1 = tmps[i1];
2282         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2283         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2284         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2285         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2286         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2287       }
2288       if (j == nz-1){
2289         tmp0 = tmps[vi[j]];
2290         sum1 -= v1[j] *tmp0;
2291         sum2 -= v2[j] *tmp0;
2292         sum3 -= v3[j] *tmp0;
2293         sum4 -= v4[j] *tmp0;
2294         sum5 -= v5[j] *tmp0;
2295       }
2296 
2297       sum2 -= v2[nz] * sum1;
2298       sum3 -= v3[nz] * sum1;
2299       sum4 -= v4[nz] * sum1;
2300       sum5 -= v5[nz] * sum1;
2301       sum3 -= v3[nz+1] * sum2;
2302       sum4 -= v4[nz+1] * sum2;
2303       sum5 -= v5[nz+1] * sum2;
2304       sum4 -= v4[nz+2] * sum3;
2305       sum5 -= v5[nz+2] * sum3;
2306       sum5 -= v5[nz+3] * sum4;
2307 
2308       tmp[row ++]=sum1;
2309       tmp[row ++]=sum2;
2310       tmp[row ++]=sum3;
2311       tmp[row ++]=sum4;
2312       tmp[row ++]=sum5;
2313       break;
2314     default:
2315       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2316     }
2317   }
2318   /* backward solve the upper triangular */
2319   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2320     nsz = ns[i];
2321     aii = ad[row+1] + 1;
2322     v1  = aa + aii;
2323     vi  = aj + aii;
2324     nz  = ad[row]- ad[row+1] - 1;
2325 
2326     if (i > 0) {
2327       /* Prefetch the indices for the next block */
2328       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2329       /* Prefetch the data for the next block */
2330       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2331     }
2332 
2333     switch (nsz){               /* Each loop in 'case' is unrolled */
2334     case 1 :
2335       sum1 = tmp[row];
2336 
2337       for(j=0 ; j<nz-1; j+=2){
2338         i0   = vi[j];
2339         i1   = vi[j+1];
2340         tmp0 = tmps[i0];
2341         tmp1 = tmps[i1];
2342         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2343       }
2344       if (j == nz-1){
2345         tmp0  = tmps[vi[j]];
2346         sum1 -= v1[j]*tmp0;
2347       }
2348       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2349       break;
2350     case 2 :
2351       sum1 = tmp[row];
2352       sum2 = tmp[row-1];
2353       v2   = aa + ad[row] + 1;
2354       for (j=0 ; j<nz-1; j+=2){
2355         i0   = vi[j];
2356         i1   = vi[j+1];
2357         tmp0 = tmps[i0];
2358         tmp1 = tmps[i1];
2359         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2360         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2361       }
2362       if (j == nz-1){
2363         tmp0  = tmps[vi[j]];
2364         sum1 -= v1[j]* tmp0;
2365         sum2 -= v2[j+1]* tmp0;
2366       }
2367 
2368       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2369       sum2   -= v2[0] * tmp0;
2370       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2371       break;
2372     case 3 :
2373       sum1 = tmp[row];
2374       sum2 = tmp[row -1];
2375       sum3 = tmp[row -2];
2376       v2   = aa + ad[row] + 1;
2377       v3   = aa + ad[row -1] + 1;
2378       for (j=0 ; j<nz-1; j+=2){
2379         i0   = vi[j];
2380         i1   = vi[j+1];
2381         tmp0 = tmps[i0];
2382         tmp1 = tmps[i1];
2383         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2384         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2385         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2386       }
2387       if (j== nz-1){
2388         tmp0  = tmps[vi[j]];
2389         sum1 -= v1[j] * tmp0;
2390         sum2 -= v2[j+1] * tmp0;
2391         sum3 -= v3[j+2] * tmp0;
2392       }
2393       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2394       sum2   -= v2[0]* tmp0;
2395       sum3   -= v3[1] * tmp0;
2396       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2397       sum3   -= v3[0]* tmp0;
2398       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2399 
2400       break;
2401     case 4 :
2402       sum1 = tmp[row];
2403       sum2 = tmp[row -1];
2404       sum3 = tmp[row -2];
2405       sum4 = tmp[row -3];
2406       v2   = aa + ad[row]+1;
2407       v3   = aa + ad[row -1]+1;
2408       v4   = aa + ad[row -2]+1;
2409 
2410       for (j=0 ; j<nz-1; j+=2){
2411         i0   = vi[j];
2412         i1   = vi[j+1];
2413         tmp0 = tmps[i0];
2414         tmp1 = tmps[i1];
2415         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2416         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2417         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2418         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2419       }
2420       if (j== nz-1){
2421         tmp0  = tmps[vi[j]];
2422         sum1 -= v1[j] * tmp0;
2423         sum2 -= v2[j+1] * tmp0;
2424         sum3 -= v3[j+2] * tmp0;
2425         sum4 -= v4[j+3] * tmp0;
2426       }
2427 
2428       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2429       sum2   -= v2[0] * tmp0;
2430       sum3   -= v3[1] * tmp0;
2431       sum4   -= v4[2] * tmp0;
2432       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2433       sum3   -= v3[0] * tmp0;
2434       sum4   -= v4[1] * tmp0;
2435       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2436       sum4   -= v4[0] * tmp0;
2437       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2438       break;
2439     case 5 :
2440       sum1 = tmp[row];
2441       sum2 = tmp[row -1];
2442       sum3 = tmp[row -2];
2443       sum4 = tmp[row -3];
2444       sum5 = tmp[row -4];
2445       v2   = aa + ad[row]+1;
2446       v3   = aa + ad[row -1]+1;
2447       v4   = aa + ad[row -2]+1;
2448       v5   = aa + ad[row -3]+1;
2449       for (j=0 ; j<nz-1; j+=2){
2450         i0   = vi[j];
2451         i1   = vi[j+1];
2452         tmp0 = tmps[i0];
2453         tmp1 = tmps[i1];
2454         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2455         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2456         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2457         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2458         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2459       }
2460       if (j==nz-1){
2461         tmp0  = tmps[vi[j]];
2462         sum1 -= v1[j] * tmp0;
2463         sum2 -= v2[j+1] * tmp0;
2464         sum3 -= v3[j+2] * tmp0;
2465         sum4 -= v4[j+3] * tmp0;
2466         sum5 -= v5[j+4] * tmp0;
2467       }
2468 
2469       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2470       sum2   -= v2[0] * tmp0;
2471       sum3   -= v3[1] * tmp0;
2472       sum4   -= v4[2] * tmp0;
2473       sum5   -= v5[3] * tmp0;
2474       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2475       sum3   -= v3[0] * tmp0;
2476       sum4   -= v4[1] * tmp0;
2477       sum5   -= v5[2] * tmp0;
2478       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2479       sum4   -= v4[0] * tmp0;
2480       sum5   -= v5[1] * tmp0;
2481       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2482       sum5   -= v5[0] * tmp0;
2483       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2484       break;
2485     default:
2486       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2487     }
2488   }
2489   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2490   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2491   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2492   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2493   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2494   PetscFunctionReturn(0);
2495 }
2496 
2497 
2498 /*
2499      Makes a longer coloring[] array and calls the usual code with that
2500 */
2501 #undef __FUNCT__
2502 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2503 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2504 {
2505   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2506   PetscErrorCode  ierr;
2507   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2508   PetscInt        *colorused,i;
2509   ISColoringValue *newcolor;
2510 
2511   PetscFunctionBegin;
2512   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2513   /* loop over inodes, marking a color for each column*/
2514   row = 0;
2515   for (i=0; i<m; i++){
2516     for (j=0; j<ns[i]; j++) {
2517       newcolor[row++] = coloring[i] + j*ncolors;
2518     }
2519   }
2520 
2521   /* eliminate unneeded colors */
2522   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2523   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2524   for (i=0; i<n; i++) {
2525     colorused[newcolor[i]] = 1;
2526   }
2527 
2528   for (i=1; i<5*ncolors; i++) {
2529     colorused[i] += colorused[i-1];
2530   }
2531   ncolors = colorused[5*ncolors-1];
2532   for (i=0; i<n; i++) {
2533     newcolor[i] = colorused[newcolor[i]]-1;
2534   }
2535   ierr = PetscFree(colorused);CHKERRQ(ierr);
2536   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2537   ierr = PetscFree(coloring);CHKERRQ(ierr);
2538   PetscFunctionReturn(0);
2539 }
2540 
2541 #include "../src/mat/blockinvert.h"
2542 
2543 #undef __FUNCT__
2544 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2545 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2546 {
2547   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2548   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2549   MatScalar          *ibdiag,*bdiag,work[25];
2550   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2551   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2552   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2553   PetscErrorCode     ierr;
2554   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2555   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2556 
2557   PetscFunctionBegin;
2558   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2559   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2560   if (its > 1) {
2561     /* switch to non-inode version */
2562     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2563     PetscFunctionReturn(0);
2564   }
2565 
2566   if (!a->inode.ibdiagvalid) {
2567     if (!a->inode.ibdiag) {
2568       /* calculate space needed for diagonal blocks */
2569       for (i=0; i<m; i++) {
2570 	cnt += sizes[i]*sizes[i];
2571       }
2572       a->inode.bdiagsize = cnt;
2573       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2574     }
2575 
2576     /* copy over the diagonal blocks and invert them */
2577     ibdiag = a->inode.ibdiag;
2578     bdiag  = a->inode.bdiag;
2579     cnt = 0;
2580     for (i=0, row = 0; i<m; i++) {
2581       for (j=0; j<sizes[i]; j++) {
2582         for (k=0; k<sizes[i]; k++) {
2583           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2584         }
2585       }
2586       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2587 
2588       switch(sizes[i]) {
2589         case 1:
2590           /* Create matrix data structure */
2591           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2592           ibdiag[cnt] = 1.0/ibdiag[cnt];
2593           break;
2594         case 2:
2595           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2596           break;
2597         case 3:
2598           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2599           break;
2600         case 4:
2601           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2602           break;
2603         case 5:
2604           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2605           break;
2606        default:
2607 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2608       }
2609       cnt += sizes[i]*sizes[i];
2610       row += sizes[i];
2611     }
2612     a->inode.ibdiagvalid = PETSC_TRUE;
2613   }
2614   ibdiag = a->inode.ibdiag;
2615   bdiag  = a->inode.bdiag;
2616 
2617 
2618   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2619   if (flag & SOR_ZERO_INITIAL_GUESS) {
2620     PetscScalar *b;
2621     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2622     if (xx != bb) {
2623       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2624     } else {
2625       b = x;
2626     }
2627     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2628 
2629       for (i=0, row=0; i<m; i++) {
2630         sz  = diag[row] - ii[row];
2631         v1  = a->a + ii[row];
2632         idx = a->j + ii[row];
2633 
2634         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2635         switch (sizes[i]){
2636           case 1:
2637 
2638             sum1  = b[row];
2639             for(n = 0; n<sz-1; n+=2) {
2640               i1   = idx[0];
2641               i2   = idx[1];
2642               idx += 2;
2643               tmp0 = x[i1];
2644               tmp1 = x[i2];
2645               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2646             }
2647 
2648             if (n == sz-1){
2649               tmp0  = x[*idx];
2650               sum1 -= *v1 * tmp0;
2651             }
2652             x[row++] = sum1*(*ibdiag++);
2653             break;
2654           case 2:
2655             v2    = a->a + ii[row+1];
2656             sum1  = b[row];
2657             sum2  = b[row+1];
2658             for(n = 0; n<sz-1; n+=2) {
2659               i1   = idx[0];
2660               i2   = idx[1];
2661               idx += 2;
2662               tmp0 = x[i1];
2663               tmp1 = x[i2];
2664               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2665               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2666             }
2667 
2668             if (n == sz-1){
2669               tmp0  = x[*idx];
2670               sum1 -= v1[0] * tmp0;
2671               sum2 -= v2[0] * tmp0;
2672             }
2673             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2674             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2675             ibdiag  += 4;
2676             break;
2677           case 3:
2678             v2    = a->a + ii[row+1];
2679             v3    = a->a + ii[row+2];
2680             sum1  = b[row];
2681             sum2  = b[row+1];
2682             sum3  = b[row+2];
2683             for(n = 0; n<sz-1; n+=2) {
2684               i1   = idx[0];
2685               i2   = idx[1];
2686               idx += 2;
2687               tmp0 = x[i1];
2688               tmp1 = x[i2];
2689               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2690               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2691               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2692             }
2693 
2694             if (n == sz-1){
2695               tmp0  = x[*idx];
2696               sum1 -= v1[0] * tmp0;
2697               sum2 -= v2[0] * tmp0;
2698               sum3 -= v3[0] * tmp0;
2699             }
2700             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2701             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2702             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2703             ibdiag  += 9;
2704             break;
2705           case 4:
2706             v2    = a->a + ii[row+1];
2707             v3    = a->a + ii[row+2];
2708             v4    = a->a + ii[row+3];
2709             sum1  = b[row];
2710             sum2  = b[row+1];
2711             sum3  = b[row+2];
2712             sum4  = b[row+3];
2713             for(n = 0; n<sz-1; n+=2) {
2714               i1   = idx[0];
2715               i2   = idx[1];
2716               idx += 2;
2717               tmp0 = x[i1];
2718               tmp1 = x[i2];
2719               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2720               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2721               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2722               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2723             }
2724 
2725             if (n == sz-1){
2726               tmp0  = x[*idx];
2727               sum1 -= v1[0] * tmp0;
2728               sum2 -= v2[0] * tmp0;
2729               sum3 -= v3[0] * tmp0;
2730               sum4 -= v4[0] * tmp0;
2731             }
2732             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2733             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2734             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2735             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2736             ibdiag  += 16;
2737             break;
2738           case 5:
2739             v2    = a->a + ii[row+1];
2740             v3    = a->a + ii[row+2];
2741             v4    = a->a + ii[row+3];
2742             v5    = a->a + ii[row+4];
2743             sum1  = b[row];
2744             sum2  = b[row+1];
2745             sum3  = b[row+2];
2746             sum4  = b[row+3];
2747             sum5  = b[row+4];
2748             for(n = 0; n<sz-1; n+=2) {
2749               i1   = idx[0];
2750               i2   = idx[1];
2751               idx += 2;
2752               tmp0 = x[i1];
2753               tmp1 = x[i2];
2754               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2755               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2756               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2757               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2758               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2759             }
2760 
2761             if (n == sz-1){
2762               tmp0  = x[*idx];
2763               sum1 -= v1[0] * tmp0;
2764               sum2 -= v2[0] * tmp0;
2765               sum3 -= v3[0] * tmp0;
2766               sum4 -= v4[0] * tmp0;
2767               sum5 -= v5[0] * tmp0;
2768             }
2769             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2770             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2771             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2772             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2773             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2774             ibdiag  += 25;
2775             break;
2776 	  default:
2777    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2778         }
2779       }
2780 
2781       xb = x;
2782       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2783     } else xb = b;
2784     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2785         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
2786       cnt = 0;
2787       for (i=0, row=0; i<m; i++) {
2788 
2789         switch (sizes[i]){
2790           case 1:
2791             x[row++] *= bdiag[cnt++];
2792             break;
2793           case 2:
2794             x1   = x[row]; x2 = x[row+1];
2795             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2796             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2797             x[row++] = tmp1;
2798             x[row++] = tmp2;
2799             cnt += 4;
2800             break;
2801           case 3:
2802             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2803             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2804             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2805             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2806             x[row++] = tmp1;
2807             x[row++] = tmp2;
2808             x[row++] = tmp3;
2809             cnt += 9;
2810             break;
2811           case 4:
2812             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2813             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2814             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2815             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2816             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2817             x[row++] = tmp1;
2818             x[row++] = tmp2;
2819             x[row++] = tmp3;
2820             x[row++] = tmp4;
2821             cnt += 16;
2822             break;
2823           case 5:
2824             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2825             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2826             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2827             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2828             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2829             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2830             x[row++] = tmp1;
2831             x[row++] = tmp2;
2832             x[row++] = tmp3;
2833             x[row++] = tmp4;
2834             x[row++] = tmp5;
2835             cnt += 25;
2836             break;
2837 	  default:
2838    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2839         }
2840       }
2841       ierr = PetscLogFlops(m);CHKERRQ(ierr);
2842     }
2843     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
2844 
2845       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2846       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2847         ibdiag -= sizes[i]*sizes[i];
2848         sz      = ii[row+1] - diag[row] - 1;
2849         v1      = a->a + diag[row] + 1;
2850         idx     = a->j + diag[row] + 1;
2851 
2852         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2853         switch (sizes[i]){
2854           case 1:
2855 
2856             sum1  = xb[row];
2857             for(n = 0; n<sz-1; n+=2) {
2858               i1   = idx[0];
2859               i2   = idx[1];
2860               idx += 2;
2861               tmp0 = x[i1];
2862               tmp1 = x[i2];
2863               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2864             }
2865 
2866             if (n == sz-1){
2867               tmp0  = x[*idx];
2868               sum1 -= *v1*tmp0;
2869             }
2870             x[row--] = sum1*(*ibdiag);
2871             break;
2872 
2873           case 2:
2874 
2875             sum1  = xb[row];
2876             sum2  = xb[row-1];
2877             /* note that sum1 is associated with the second of the two rows */
2878             v2    = a->a + diag[row-1] + 2;
2879             for(n = 0; n<sz-1; n+=2) {
2880               i1   = idx[0];
2881               i2   = idx[1];
2882               idx += 2;
2883               tmp0 = x[i1];
2884               tmp1 = x[i2];
2885               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2886               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2887             }
2888 
2889             if (n == sz-1){
2890               tmp0  = x[*idx];
2891               sum1 -= *v1*tmp0;
2892               sum2 -= *v2*tmp0;
2893             }
2894             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
2895             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
2896             break;
2897           case 3:
2898 
2899             sum1  = xb[row];
2900             sum2  = xb[row-1];
2901             sum3  = xb[row-2];
2902             v2    = a->a + diag[row-1] + 2;
2903             v3    = a->a + diag[row-2] + 3;
2904             for(n = 0; n<sz-1; n+=2) {
2905               i1   = idx[0];
2906               i2   = idx[1];
2907               idx += 2;
2908               tmp0 = x[i1];
2909               tmp1 = x[i2];
2910               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2911               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2912               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2913             }
2914 
2915             if (n == sz-1){
2916               tmp0  = x[*idx];
2917               sum1 -= *v1*tmp0;
2918               sum2 -= *v2*tmp0;
2919               sum3 -= *v3*tmp0;
2920             }
2921             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2922             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2923             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2924             break;
2925           case 4:
2926 
2927             sum1  = xb[row];
2928             sum2  = xb[row-1];
2929             sum3  = xb[row-2];
2930             sum4  = xb[row-3];
2931             v2    = a->a + diag[row-1] + 2;
2932             v3    = a->a + diag[row-2] + 3;
2933             v4    = a->a + diag[row-3] + 4;
2934             for(n = 0; n<sz-1; n+=2) {
2935               i1   = idx[0];
2936               i2   = idx[1];
2937               idx += 2;
2938               tmp0 = x[i1];
2939               tmp1 = x[i2];
2940               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2941               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2942               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2943               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2944             }
2945 
2946             if (n == sz-1){
2947               tmp0  = x[*idx];
2948               sum1 -= *v1*tmp0;
2949               sum2 -= *v2*tmp0;
2950               sum3 -= *v3*tmp0;
2951               sum4 -= *v4*tmp0;
2952             }
2953             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2954             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2955             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2956             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2957             break;
2958           case 5:
2959 
2960             sum1  = xb[row];
2961             sum2  = xb[row-1];
2962             sum3  = xb[row-2];
2963             sum4  = xb[row-3];
2964             sum5  = xb[row-4];
2965             v2    = a->a + diag[row-1] + 2;
2966             v3    = a->a + diag[row-2] + 3;
2967             v4    = a->a + diag[row-3] + 4;
2968             v5    = a->a + diag[row-4] + 5;
2969             for(n = 0; n<sz-1; n+=2) {
2970               i1   = idx[0];
2971               i2   = idx[1];
2972               idx += 2;
2973               tmp0 = x[i1];
2974               tmp1 = x[i2];
2975               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2976               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2977               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2978               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2979               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2980             }
2981 
2982             if (n == sz-1){
2983               tmp0  = x[*idx];
2984               sum1 -= *v1*tmp0;
2985               sum2 -= *v2*tmp0;
2986               sum3 -= *v3*tmp0;
2987               sum4 -= *v4*tmp0;
2988               sum5 -= *v5*tmp0;
2989             }
2990             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2991             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2992             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2993             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2994             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2995             break;
2996 	  default:
2997    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2998         }
2999       }
3000 
3001       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3002     }
3003     its--;
3004     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3005     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
3006   }
3007   if (flag & SOR_EISENSTAT) {
3008     const PetscScalar *b;
3009     MatScalar         *t = a->inode.ssor_work;
3010 
3011     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3012     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3013     /*
3014           Apply  (U + D)^-1  where D is now the block diagonal
3015     */
3016     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3017     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3018       ibdiag -= sizes[i]*sizes[i];
3019       sz      = ii[row+1] - diag[row] - 1;
3020       v1      = a->a + diag[row] + 1;
3021       idx     = a->j + diag[row] + 1;
3022       CHKMEMQ;
3023       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3024       switch (sizes[i]){
3025         case 1:
3026 
3027 	  sum1  = b[row];
3028 	  for(n = 0; n<sz-1; n+=2) {
3029 	    i1   = idx[0];
3030 	    i2   = idx[1];
3031 	    idx += 2;
3032 	    tmp0 = x[i1];
3033 	    tmp1 = x[i2];
3034 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3035 	  }
3036 
3037 	  if (n == sz-1){
3038 	    tmp0  = x[*idx];
3039 	    sum1 -= *v1*tmp0;
3040 	  }
3041 	  x[row] = sum1*(*ibdiag);row--;
3042 	  break;
3043 
3044         case 2:
3045 
3046 	  sum1  = b[row];
3047 	  sum2  = b[row-1];
3048 	  /* note that sum1 is associated with the second of the two rows */
3049 	  v2    = a->a + diag[row-1] + 2;
3050 	  for(n = 0; n<sz-1; n+=2) {
3051 	    i1   = idx[0];
3052 	    i2   = idx[1];
3053 	    idx += 2;
3054 	    tmp0 = x[i1];
3055 	    tmp1 = x[i2];
3056 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3057 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3058 	  }
3059 
3060 	  if (n == sz-1){
3061 	    tmp0  = x[*idx];
3062 	    sum1 -= *v1*tmp0;
3063 	    sum2 -= *v2*tmp0;
3064 	  }
3065 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3066 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3067           row -= 2;
3068 	  break;
3069         case 3:
3070 
3071 	  sum1  = b[row];
3072 	  sum2  = b[row-1];
3073 	  sum3  = b[row-2];
3074 	  v2    = a->a + diag[row-1] + 2;
3075 	  v3    = a->a + diag[row-2] + 3;
3076 	  for(n = 0; n<sz-1; n+=2) {
3077 	    i1   = idx[0];
3078 	    i2   = idx[1];
3079 	    idx += 2;
3080 	    tmp0 = x[i1];
3081 	    tmp1 = x[i2];
3082 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3083 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3084 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3085 	  }
3086 
3087 	  if (n == sz-1){
3088 	    tmp0  = x[*idx];
3089 	    sum1 -= *v1*tmp0;
3090 	    sum2 -= *v2*tmp0;
3091 	    sum3 -= *v3*tmp0;
3092 	  }
3093 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3094 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3095 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3096           row -= 3;
3097 	  break;
3098         case 4:
3099 
3100 	  sum1  = b[row];
3101 	  sum2  = b[row-1];
3102 	  sum3  = b[row-2];
3103 	  sum4  = b[row-3];
3104 	  v2    = a->a + diag[row-1] + 2;
3105 	  v3    = a->a + diag[row-2] + 3;
3106 	  v4    = a->a + diag[row-3] + 4;
3107 	  for(n = 0; n<sz-1; n+=2) {
3108 	    i1   = idx[0];
3109 	    i2   = idx[1];
3110 	    idx += 2;
3111 	    tmp0 = x[i1];
3112 	    tmp1 = x[i2];
3113 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3114 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3115 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3116 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3117 	  }
3118 
3119 	  if (n == sz-1){
3120 	    tmp0  = x[*idx];
3121 	    sum1 -= *v1*tmp0;
3122 	    sum2 -= *v2*tmp0;
3123 	    sum3 -= *v3*tmp0;
3124 	    sum4 -= *v4*tmp0;
3125 	  }
3126 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3127 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3128 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3129 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3130           row -= 4;
3131 	  break;
3132         case 5:
3133 
3134 	  sum1  = b[row];
3135 	  sum2  = b[row-1];
3136 	  sum3  = b[row-2];
3137 	  sum4  = b[row-3];
3138 	  sum5  = b[row-4];
3139 	  v2    = a->a + diag[row-1] + 2;
3140 	  v3    = a->a + diag[row-2] + 3;
3141 	  v4    = a->a + diag[row-3] + 4;
3142 	  v5    = a->a + diag[row-4] + 5;
3143 	  for(n = 0; n<sz-1; n+=2) {
3144 	    i1   = idx[0];
3145 	    i2   = idx[1];
3146 	    idx += 2;
3147 	    tmp0 = x[i1];
3148 	    tmp1 = x[i2];
3149 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3150 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3151 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3152 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3153 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3154 	  }
3155 
3156 	  if (n == sz-1){
3157 	    tmp0  = x[*idx];
3158 	    sum1 -= *v1*tmp0;
3159 	    sum2 -= *v2*tmp0;
3160 	    sum3 -= *v3*tmp0;
3161 	    sum4 -= *v4*tmp0;
3162 	    sum5 -= *v5*tmp0;
3163 	  }
3164 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3165 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3166 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3167 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3168 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3169           row -= 5;
3170 	  break;
3171         default:
3172 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3173       }
3174       CHKMEMQ;
3175     }
3176     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3177 
3178     /*
3179            t = b - D x    where D is the block diagonal
3180     */
3181     cnt = 0;
3182     for (i=0, row=0; i<m; i++) {
3183       CHKMEMQ;
3184       switch (sizes[i]){
3185         case 1:
3186 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3187 	  break;
3188         case 2:
3189 	  x1   = x[row]; x2 = x[row+1];
3190 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3191 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3192 	  t[row]   = b[row] - tmp1;
3193 	  t[row+1] = b[row+1] - tmp2; row += 2;
3194 	  cnt += 4;
3195 	  break;
3196         case 3:
3197 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3198 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3199 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3200 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3201 	  t[row] = b[row] - tmp1;
3202 	  t[row+1] = b[row+1] - tmp2;
3203 	  t[row+2] = b[row+2] - tmp3; row += 3;
3204 	  cnt += 9;
3205 	  break;
3206         case 4:
3207 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3208 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3209 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3210 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3211 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3212 	  t[row] = b[row] - tmp1;
3213 	  t[row+1] = b[row+1] - tmp2;
3214 	  t[row+2] = b[row+2] - tmp3;
3215 	  t[row+3] = b[row+3] - tmp4; row += 4;
3216 	  cnt += 16;
3217 	  break;
3218         case 5:
3219 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3220 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3221 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3222 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3223 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3224 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3225 	  t[row] = b[row] - tmp1;
3226 	  t[row+1] = b[row+1] - tmp2;
3227 	  t[row+2] = b[row+2] - tmp3;
3228 	  t[row+3] = b[row+3] - tmp4;
3229 	  t[row+4] = b[row+4] - tmp5;row += 5;
3230 	  cnt += 25;
3231 	  break;
3232         default:
3233 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3234       }
3235       CHKMEMQ;
3236     }
3237     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3238 
3239 
3240 
3241     /*
3242           Apply (L + D)^-1 where D is the block diagonal
3243     */
3244     for (i=0, row=0; i<m; i++) {
3245       sz  = diag[row] - ii[row];
3246       v1  = a->a + ii[row];
3247       idx = a->j + ii[row];
3248       CHKMEMQ;
3249       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3250       switch (sizes[i]){
3251         case 1:
3252 
3253 	  sum1  = t[row];
3254 	  for(n = 0; n<sz-1; n+=2) {
3255 	    i1   = idx[0];
3256 	    i2   = idx[1];
3257 	    idx += 2;
3258 	    tmp0 = t[i1];
3259 	    tmp1 = t[i2];
3260 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3261 	  }
3262 
3263 	  if (n == sz-1){
3264 	    tmp0  = t[*idx];
3265 	    sum1 -= *v1 * tmp0;
3266 	  }
3267 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3268 	  break;
3269         case 2:
3270 	  v2    = a->a + ii[row+1];
3271 	  sum1  = t[row];
3272 	  sum2  = t[row+1];
3273 	  for(n = 0; n<sz-1; n+=2) {
3274 	    i1   = idx[0];
3275 	    i2   = idx[1];
3276 	    idx += 2;
3277 	    tmp0 = t[i1];
3278 	    tmp1 = t[i2];
3279 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3280 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3281 	  }
3282 
3283 	  if (n == sz-1){
3284 	    tmp0  = t[*idx];
3285 	    sum1 -= v1[0] * tmp0;
3286 	    sum2 -= v2[0] * tmp0;
3287 	  }
3288 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3289 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3290 	  ibdiag  += 4; row += 2;
3291 	  break;
3292         case 3:
3293 	  v2    = a->a + ii[row+1];
3294 	  v3    = a->a + ii[row+2];
3295 	  sum1  = t[row];
3296 	  sum2  = t[row+1];
3297 	  sum3  = t[row+2];
3298 	  for(n = 0; n<sz-1; n+=2) {
3299 	    i1   = idx[0];
3300 	    i2   = idx[1];
3301 	    idx += 2;
3302 	    tmp0 = t[i1];
3303 	    tmp1 = t[i2];
3304 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3305 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3306 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3307 	  }
3308 
3309 	  if (n == sz-1){
3310 	    tmp0  = t[*idx];
3311 	    sum1 -= v1[0] * tmp0;
3312 	    sum2 -= v2[0] * tmp0;
3313 	    sum3 -= v3[0] * tmp0;
3314 	  }
3315 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3316 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3317 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3318 	  ibdiag  += 9; row += 3;
3319 	  break;
3320         case 4:
3321 	  v2    = a->a + ii[row+1];
3322 	  v3    = a->a + ii[row+2];
3323 	  v4    = a->a + ii[row+3];
3324 	  sum1  = t[row];
3325 	  sum2  = t[row+1];
3326 	  sum3  = t[row+2];
3327 	  sum4  = t[row+3];
3328 	  for(n = 0; n<sz-1; n+=2) {
3329 	    i1   = idx[0];
3330 	    i2   = idx[1];
3331 	    idx += 2;
3332 	    tmp0 = t[i1];
3333 	    tmp1 = t[i2];
3334 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3335 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3336 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3337 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3338 	  }
3339 
3340 	  if (n == sz-1){
3341 	    tmp0  = t[*idx];
3342 	    sum1 -= v1[0] * tmp0;
3343 	    sum2 -= v2[0] * tmp0;
3344 	    sum3 -= v3[0] * tmp0;
3345 	    sum4 -= v4[0] * tmp0;
3346 	  }
3347 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3348 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3349 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3350 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3351 	  ibdiag  += 16; row += 4;
3352 	  break;
3353         case 5:
3354 	  v2    = a->a + ii[row+1];
3355 	  v3    = a->a + ii[row+2];
3356 	  v4    = a->a + ii[row+3];
3357 	  v5    = a->a + ii[row+4];
3358 	  sum1  = t[row];
3359 	  sum2  = t[row+1];
3360 	  sum3  = t[row+2];
3361 	  sum4  = t[row+3];
3362 	  sum5  = t[row+4];
3363 	  for(n = 0; n<sz-1; n+=2) {
3364 	    i1   = idx[0];
3365 	    i2   = idx[1];
3366 	    idx += 2;
3367 	    tmp0 = t[i1];
3368 	    tmp1 = t[i2];
3369 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3370 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3371 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3372 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3373 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3374 	  }
3375 
3376 	  if (n == sz-1){
3377 	    tmp0  = t[*idx];
3378 	    sum1 -= v1[0] * tmp0;
3379 	    sum2 -= v2[0] * tmp0;
3380 	    sum3 -= v3[0] * tmp0;
3381 	    sum4 -= v4[0] * tmp0;
3382 	    sum5 -= v5[0] * tmp0;
3383 	  }
3384 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3385 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3386 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3387 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3388 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3389 	  ibdiag  += 25; row += 5;
3390 	  break;
3391         default:
3392 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3393       }
3394       CHKMEMQ;
3395     }
3396     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3397     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3398     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3399   }
3400   PetscFunctionReturn(0);
3401 }
3402 
3403 #undef __FUNCT__
3404 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3405 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3406 {
3407   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3408   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3409   const MatScalar    *bdiag = a->inode.bdiag;
3410   const PetscScalar  *b;
3411   PetscErrorCode      ierr;
3412   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3413   const PetscInt      *sizes = a->inode.size;
3414 
3415   PetscFunctionBegin;
3416   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3417   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3418   cnt = 0;
3419   for (i=0, row=0; i<m; i++) {
3420     switch (sizes[i]){
3421       case 1:
3422 	x[row] = b[row]*bdiag[cnt++];row++;
3423 	break;
3424       case 2:
3425 	x1   = b[row]; x2 = b[row+1];
3426 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3427 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3428 	x[row++] = tmp1;
3429 	x[row++] = tmp2;
3430 	cnt += 4;
3431 	break;
3432       case 3:
3433 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3434 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3435 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3436 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3437 	x[row++] = tmp1;
3438 	x[row++] = tmp2;
3439 	x[row++] = tmp3;
3440 	cnt += 9;
3441 	break;
3442       case 4:
3443 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3444 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3445 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3446 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3447 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3448 	x[row++] = tmp1;
3449 	x[row++] = tmp2;
3450 	x[row++] = tmp3;
3451 	x[row++] = tmp4;
3452 	cnt += 16;
3453 	break;
3454       case 5:
3455 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3456 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3457 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3458 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3459 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3460 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3461 	x[row++] = tmp1;
3462 	x[row++] = tmp2;
3463 	x[row++] = tmp3;
3464 	x[row++] = tmp4;
3465 	x[row++] = tmp5;
3466 	cnt += 25;
3467 	break;
3468       default:
3469 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3470     }
3471   }
3472   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3473   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3474   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3475   PetscFunctionReturn(0);
3476 }
3477 
3478 /*
3479     samestructure indicates that the matrix has not changed its nonzero structure so we
3480     do not need to recompute the inodes
3481 */
3482 #undef __FUNCT__
3483 #define __FUNCT__ "Mat_CheckInode"
3484 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3485 {
3486   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3487   PetscErrorCode ierr;
3488   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3489   PetscTruth     flag;
3490 
3491   PetscFunctionBegin;
3492   if (!a->inode.use)                     PetscFunctionReturn(0);
3493   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3494 
3495 
3496   m = A->rmap->n;
3497   if (a->inode.size) {ns = a->inode.size;}
3498   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3499 
3500   i          = 0;
3501   node_count = 0;
3502   idx        = a->j;
3503   ii         = a->i;
3504   while (i < m){                /* For each row */
3505     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3506     /* Limits the number of elements in a node to 'a->inode.limit' */
3507     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3508       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3509       if (nzy != nzx) break;
3510       idy  += nzx;             /* Same nonzero pattern */
3511       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3512       if (!flag) break;
3513     }
3514     ns[node_count++] = blk_size;
3515     idx += blk_size*nzx;
3516     i    = j;
3517   }
3518   /* If not enough inodes found,, do not use inode version of the routines */
3519   if (!a->inode.size && m && node_count > .9*m) {
3520     ierr = PetscFree(ns);CHKERRQ(ierr);
3521     a->inode.node_count     = 0;
3522     a->inode.size           = PETSC_NULL;
3523     a->inode.use            = PETSC_FALSE;
3524     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3525   } else {
3526     A->ops->mult              = MatMult_SeqAIJ_Inode;
3527     A->ops->sor               = MatSOR_SeqAIJ_Inode;
3528     A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3529     A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3530     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3531     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3532     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3533     A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3534     A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3535     A->ops->lufactornumeric   = MatLUFactorNumeric_SeqAIJ_Inode_inplace;
3536     a->inode.node_count       = node_count;
3537     a->inode.size             = ns;
3538     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3539   }
3540   PetscFunctionReturn(0);
3541 }
3542 
3543 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
3544 PetscInt __k, *__vi; \
3545 __vi = aj + ai[row];				\
3546 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
3547 __vi = aj + adiag[row];				\
3548 cols[nzl] = __vi[0];\
3549 __vi = aj + adiag[row+1]+1;\
3550 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
3551 
3552 
3553 /*
3554    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3555    Modified from Mat_CheckInode().
3556 
3557    Input Parameters:
3558 +  Mat A - ILU or LU matrix factor
3559 -  samestructure - TURE indicates that the matrix has not changed its nonzero structure so we
3560     do not need to recompute the inodes
3561 */
3562 #undef __FUNCT__
3563 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3564 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3565 {
3566   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3567   PetscErrorCode ierr;
3568   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3569   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3570   PetscTruth     flag;
3571 
3572   PetscFunctionBegin;
3573   if (!a->inode.use)                     PetscFunctionReturn(0);
3574   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3575 
3576   m = A->rmap->n;
3577   if (a->inode.size) {ns = a->inode.size;}
3578   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3579 
3580   i          = 0;
3581   node_count = 0;
3582   while (i < m){                /* For each row */
3583     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3584     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3585     nzx  = nzl1 + nzu1 + 1;
3586     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3587     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3588 
3589     /* Limits the number of elements in a node to 'a->inode.limit' */
3590     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3591       nzl2    = ai[j+1] - ai[j];
3592       nzu2    = adiag[j] - adiag[j+1] - 1;
3593       nzy     = nzl2 + nzu2 + 1;
3594       if( nzy != nzx) break;
3595       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3596       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
3597       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3598       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3599       ierr = PetscFree(cols2);CHKERRQ(ierr);
3600     }
3601     ns[node_count++] = blk_size;
3602     ierr = PetscFree(cols1);CHKERRQ(ierr);
3603     i    = j;
3604   }
3605   /* If not enough inodes found,, do not use inode version of the routines */
3606   if (!a->inode.size && m && node_count > .9*m) {
3607     ierr = PetscFree(ns);CHKERRQ(ierr);
3608     a->inode.node_count     = 0;
3609     a->inode.size           = PETSC_NULL;
3610     a->inode.use            = PETSC_FALSE;
3611     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3612   } else {
3613     A->ops->mult              = 0;
3614     A->ops->sor               = 0;
3615     A->ops->multadd           = 0;
3616     A->ops->getrowij          = 0;
3617     A->ops->restorerowij      = 0;
3618     A->ops->getcolumnij       = 0;
3619     A->ops->restorecolumnij   = 0;
3620     A->ops->coloringpatch     = 0;
3621     A->ops->multdiagonalblock = 0;
3622     A->ops->lufactornumeric   = MatLUFactorNumeric_SeqAIJ_Inode; /* not done yet */
3623     a->inode.node_count       = node_count;
3624     a->inode.size             = ns;
3625     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3626   }
3627   PetscFunctionReturn(0);
3628 }
3629 
3630 /*
3631      This is really ugly. if inodes are used this replaces the
3632   permutations with ones that correspond to rows/cols of the matrix
3633   rather then inode blocks
3634 */
3635 #undef __FUNCT__
3636 #define __FUNCT__ "MatInodeAdjustForInodes"
3637 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3638 {
3639   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3640 
3641   PetscFunctionBegin;
3642   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3643   if (f) {
3644     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3645   }
3646   PetscFunctionReturn(0);
3647 }
3648 
3649 EXTERN_C_BEGIN
3650 #undef __FUNCT__
3651 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3652 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3653 {
3654   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3655   PetscErrorCode ierr;
3656   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3657   const PetscInt *ridx,*cidx;
3658   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3659   PetscInt       nslim_col,*ns_col;
3660   IS             ris = *rperm,cis = *cperm;
3661 
3662   PetscFunctionBegin;
3663   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3664   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3665 
3666   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3667   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3668   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3669 
3670   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3671   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3672 
3673   /* Form the inode structure for the rows of permuted matric using inv perm*/
3674   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3675 
3676   /* Construct the permutations for rows*/
3677   for (i=0,row = 0; i<nslim_row; ++i){
3678     indx      = ridx[i];
3679     start_val = tns[indx];
3680     end_val   = tns[indx + 1];
3681     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3682   }
3683 
3684   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3685   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3686 
3687  /* Construct permutations for columns */
3688   for (i=0,col=0; i<nslim_col; ++i){
3689     indx      = cidx[i];
3690     start_val = tns[indx];
3691     end_val   = tns[indx + 1];
3692     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3693   }
3694 
3695   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3696   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3697   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3698   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3699 
3700   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3701   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3702 
3703   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3704   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3705   ierr = ISDestroy(cis);CHKERRQ(ierr);
3706   ierr = ISDestroy(ris);CHKERRQ(ierr);
3707   ierr = PetscFree(tns);CHKERRQ(ierr);
3708   PetscFunctionReturn(0);
3709 }
3710 EXTERN_C_END
3711 
3712 #undef __FUNCT__
3713 #define __FUNCT__ "MatInodeGetInodeSizes"
3714 /*@C
3715    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3716 
3717    Collective on Mat
3718 
3719    Input Parameter:
3720 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3721 
3722    Output Parameter:
3723 +  node_count - no of inodes present in the matrix.
3724 .  sizes      - an array of size node_count,with sizes of each inode.
3725 -  limit      - the max size used to generate the inodes.
3726 
3727    Level: advanced
3728 
3729    Notes: This routine returns some internal storage information
3730    of the matrix, it is intended to be used by advanced users.
3731    It should be called after the matrix is assembled.
3732    The contents of the sizes[] array should not be changed.
3733    PETSC_NULL may be passed for information not requested.
3734 
3735 .keywords: matrix, seqaij, get, inode
3736 
3737 .seealso: MatGetInfo()
3738 @*/
3739 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3740 {
3741   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3742 
3743   PetscFunctionBegin;
3744   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3745   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3746   if (f) {
3747     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3748   }
3749   PetscFunctionReturn(0);
3750 }
3751 
3752 EXTERN_C_BEGIN
3753 #undef __FUNCT__
3754 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3755 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3756 {
3757   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3758 
3759   PetscFunctionBegin;
3760   if (node_count) *node_count = a->inode.node_count;
3761   if (sizes)      *sizes      = a->inode.size;
3762   if (limit)      *limit      = a->inode.limit;
3763   PetscFunctionReturn(0);
3764 }
3765 EXTERN_C_END
3766