xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 628f99d79ee051acbd24c51bb4edfc658064ee22)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 
1182 #undef __FUNCT__
1183 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1184 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1185 {
1186   Mat               C = B;
1187   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1188   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1189   PetscErrorCode    ierr;
1190   const PetscInt    *r,*ic,*c,*ics;
1191   PetscInt          n = A->rmap->n,*bi = b->i;
1192   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1193   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1194   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1195   PetscScalar       mul1,mul2,mul3,tmp;
1196   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1197   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1198   PetscReal         rs=0.0;
1199   FactorShiftCtx    sctx;
1200   PetscInt          newshift;
1201 
1202   PetscFunctionBegin;
1203   sctx.shift_top      = 0;
1204   sctx.nshift_max     = 0;
1205   sctx.shift_lo       = 0;
1206   sctx.shift_hi       = 0;
1207   sctx.shift_fraction = 0;
1208 
1209   /* if both shift schemes are chosen by user, only use info->shiftpd */
1210   if (info->shifttype==MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1211     sctx.shift_top = 0;
1212     for (i=0; i<n; i++) {
1213       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1214       rs    = 0.0;
1215       ajtmp = aj + ai[i];
1216       rtmp1 = aa + ai[i];
1217       nz = ai[i+1] - ai[i];
1218       for (j=0; j<nz; j++){
1219         if (*ajtmp != i){
1220           rs += PetscAbsScalar(*rtmp1++);
1221         } else {
1222           rs -= PetscRealPart(*rtmp1++);
1223         }
1224         ajtmp++;
1225       }
1226       if (rs>sctx.shift_top) sctx.shift_top = rs;
1227     }
1228     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1229     sctx.shift_top *= 1.1;
1230     sctx.nshift_max = 5;
1231     sctx.shift_lo   = 0.;
1232     sctx.shift_hi   = 1.;
1233   }
1234   sctx.shift_amount = 0;
1235   sctx.nshift       = 0;
1236 
1237   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1238   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1239   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1240   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1241   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1242   ics   = ic ;
1243   rtmp22 = rtmp11 + n;
1244   rtmp33 = rtmp22 + n;
1245 
1246   node_max = a->inode.node_count;
1247   ns       = a->inode.size;
1248   if (!ns){
1249     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1250   }
1251 
1252   /* If max inode size > 3, split it into two inodes.*/
1253   /* also map the inode sizes according to the ordering */
1254   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1255   for (i=0,j=0; i<node_max; ++i,++j){
1256     if (ns[i]>3) {
1257       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1258       ++j;
1259       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1260     } else {
1261       tmp_vec1[j] = ns[i];
1262     }
1263   }
1264   /* Use the correct node_max */
1265   node_max = j;
1266 
1267   /* Now reorder the inode info based on mat re-ordering info */
1268   /* First create a row -> inode_size_array_index map */
1269   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1270   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1271   for (i=0,row=0; i<node_max; i++) {
1272     nodesz = tmp_vec1[i];
1273     for (j=0; j<nodesz; j++,row++) {
1274       nsmap[row] = i;
1275     }
1276   }
1277   /* Using nsmap, create a reordered ns structure */
1278   for (i=0,j=0; i< node_max; i++) {
1279     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1280     tmp_vec2[i]  = nodesz;
1281     j           += nodesz;
1282   }
1283   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1284   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1285   /* Now use the correct ns */
1286   ns = tmp_vec2;
1287 
1288   do {
1289     sctx.useshift = PETSC_FALSE;
1290     /* Now loop over each block-row, and do the factorization */
1291     for (i=0,row=0; i<node_max; i++) {
1292       nodesz = ns[i];
1293       nz     = bi[row+1] - bi[row];
1294       bjtmp  = bj + bi[row];
1295 
1296       switch (nodesz){
1297       case 1:
1298         for  (j=0; j<nz; j++){
1299           idx        = bjtmp[j];
1300           rtmp11[idx] = 0.0;
1301         }
1302 
1303         /* load in initial (unfactored row) */
1304         idx    = r[row];
1305         nz_tmp = ai[idx+1] - ai[idx];
1306         ajtmp  = aj + ai[idx];
1307         v1     = aa + ai[idx];
1308 
1309         for (j=0; j<nz_tmp; j++) {
1310           idx        = ics[ajtmp[j]];
1311           rtmp11[idx] = v1[j];
1312         }
1313         rtmp11[ics[r[row]]] += sctx.shift_amount;
1314 
1315         prow = *bjtmp++ ;
1316         while (prow < row) {
1317           pc1 = rtmp11 + prow;
1318           if (*pc1 != 0.0){
1319             pv   = ba + bd[prow];
1320             pj   = nbj + bd[prow];
1321             mul1 = *pc1 * *pv++;
1322             *pc1 = mul1;
1323             nz_tmp = bi[prow+1] - bd[prow] - 1;
1324             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1325             for (j=0; j<nz_tmp; j++) {
1326               tmp = pv[j];
1327               idx = pj[j];
1328               rtmp11[idx] -= mul1 * tmp;
1329             }
1330           }
1331           prow = *bjtmp++ ;
1332         }
1333         pj  = bj + bi[row];
1334         pc1 = ba + bi[row];
1335 
1336         sctx.pv    = rtmp11[row];
1337         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1338         rs         = 0.0;
1339         for (j=0; j<nz; j++) {
1340           idx    = pj[j];
1341           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1342           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1343         }
1344         sctx.rs  = rs;
1345         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1346         if (newshift == 1) goto endofwhile;
1347         break;
1348 
1349       case 2:
1350         for (j=0; j<nz; j++) {
1351           idx        = bjtmp[j];
1352           rtmp11[idx] = 0.0;
1353           rtmp22[idx] = 0.0;
1354         }
1355 
1356         /* load in initial (unfactored row) */
1357         idx    = r[row];
1358         nz_tmp = ai[idx+1] - ai[idx];
1359         ajtmp  = aj + ai[idx];
1360         v1     = aa + ai[idx];
1361         v2     = aa + ai[idx+1];
1362         for (j=0; j<nz_tmp; j++) {
1363           idx        = ics[ajtmp[j]];
1364           rtmp11[idx] = v1[j];
1365           rtmp22[idx] = v2[j];
1366         }
1367         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1368         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1369 
1370         prow = *bjtmp++ ;
1371         while (prow < row) {
1372           pc1 = rtmp11 + prow;
1373           pc2 = rtmp22 + prow;
1374           if (*pc1 != 0.0 || *pc2 != 0.0){
1375             pv   = ba + bd[prow];
1376             pj   = nbj + bd[prow];
1377             mul1 = *pc1 * *pv;
1378             mul2 = *pc2 * *pv;
1379             ++pv;
1380             *pc1 = mul1;
1381             *pc2 = mul2;
1382 
1383             nz_tmp = bi[prow+1] - bd[prow] - 1;
1384             for (j=0; j<nz_tmp; j++) {
1385               tmp = pv[j];
1386               idx = pj[j];
1387               rtmp11[idx] -= mul1 * tmp;
1388               rtmp22[idx] -= mul2 * tmp;
1389             }
1390             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1391           }
1392           prow = *bjtmp++ ;
1393         }
1394 
1395         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1396         pc1 = rtmp11 + prow;
1397         pc2 = rtmp22 + prow;
1398 
1399         sctx.pv = *pc1;
1400         pj      = bj + bi[prow];
1401         rs      = 0.0;
1402         for (j=0; j<nz; j++){
1403           idx = pj[j];
1404           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1405         }
1406         sctx.rs = rs;
1407         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1408         if (newshift == 1) goto endofwhile;
1409 
1410         if (*pc2 != 0.0){
1411           pj     = nbj + bd[prow];
1412           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1413           *pc2   = mul2;
1414           nz_tmp = bi[prow+1] - bd[prow] - 1;
1415           for (j=0; j<nz_tmp; j++) {
1416             idx = pj[j] ;
1417             tmp = rtmp11[idx];
1418             rtmp22[idx] -= mul2 * tmp;
1419           }
1420           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1421         }
1422 
1423         pj  = bj + bi[row];
1424         pc1 = ba + bi[row];
1425         pc2 = ba + bi[row+1];
1426 
1427         sctx.pv = rtmp22[row+1];
1428         rs = 0.0;
1429         rtmp11[row]   = 1.0/rtmp11[row];
1430         rtmp22[row+1] = 1.0/rtmp22[row+1];
1431         /* copy row entries from dense representation to sparse */
1432         for (j=0; j<nz; j++) {
1433           idx    = pj[j];
1434           pc1[j] = rtmp11[idx];
1435           pc2[j] = rtmp22[idx];
1436           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1437         }
1438         sctx.rs = rs;
1439         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1440         if (newshift == 1) goto endofwhile;
1441         break;
1442 
1443       case 3:
1444         for  (j=0; j<nz; j++) {
1445           idx        = bjtmp[j];
1446           rtmp11[idx] = 0.0;
1447           rtmp22[idx] = 0.0;
1448           rtmp33[idx] = 0.0;
1449         }
1450         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1451         idx    = r[row];
1452         nz_tmp = ai[idx+1] - ai[idx];
1453         ajtmp = aj + ai[idx];
1454         v1    = aa + ai[idx];
1455         v2    = aa + ai[idx+1];
1456         v3    = aa + ai[idx+2];
1457         for (j=0; j<nz_tmp; j++) {
1458           idx        = ics[ajtmp[j]];
1459           rtmp11[idx] = v1[j];
1460           rtmp22[idx] = v2[j];
1461           rtmp33[idx] = v3[j];
1462         }
1463         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1464         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1465         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1466 
1467         /* loop over all pivot row blocks above this row block */
1468         prow = *bjtmp++ ;
1469         while (prow < row) {
1470           pc1 = rtmp11 + prow;
1471           pc2 = rtmp22 + prow;
1472           pc3 = rtmp33 + prow;
1473           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1474             pv   = ba  + bd[prow];
1475             pj   = nbj + bd[prow];
1476             mul1 = *pc1 * *pv;
1477             mul2 = *pc2 * *pv;
1478             mul3 = *pc3 * *pv;
1479             ++pv;
1480             *pc1 = mul1;
1481             *pc2 = mul2;
1482             *pc3 = mul3;
1483 
1484             nz_tmp = bi[prow+1] - bd[prow] - 1;
1485             /* update this row based on pivot row */
1486             for (j=0; j<nz_tmp; j++) {
1487               tmp = pv[j];
1488               idx = pj[j];
1489               rtmp11[idx] -= mul1 * tmp;
1490               rtmp22[idx] -= mul2 * tmp;
1491               rtmp33[idx] -= mul3 * tmp;
1492             }
1493             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1494           }
1495           prow = *bjtmp++ ;
1496         }
1497 
1498         /* Now take care of diagonal 3x3 block in this set of rows */
1499         /* note: prow = row here */
1500         pc1 = rtmp11 + prow;
1501         pc2 = rtmp22 + prow;
1502         pc3 = rtmp33 + prow;
1503 
1504         sctx.pv = *pc1;
1505         pj      = bj + bi[prow];
1506         rs      = 0.0;
1507         for (j=0; j<nz; j++){
1508           idx = pj[j];
1509           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
1510         }
1511         sctx.rs = rs;
1512         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1513         if (newshift == 1) goto endofwhile;
1514 
1515         if (*pc2 != 0.0 || *pc3 != 0.0){
1516           mul2 = (*pc2)/(*pc1);
1517           mul3 = (*pc3)/(*pc1);
1518           *pc2 = mul2;
1519           *pc3 = mul3;
1520           nz_tmp = bi[prow+1] - bd[prow] - 1;
1521           pj     = nbj + bd[prow];
1522           for (j=0; j<nz_tmp; j++) {
1523             idx = pj[j] ;
1524             tmp = rtmp11[idx];
1525             rtmp22[idx] -= mul2 * tmp;
1526             rtmp33[idx] -= mul3 * tmp;
1527           }
1528           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1529         }
1530         ++prow;
1531 
1532         pc2 = rtmp22 + prow;
1533         pc3 = rtmp33 + prow;
1534         sctx.pv = *pc2;
1535         pj      = bj + bi[prow];
1536         rs      = 0.0;
1537         for (j=0; j<nz; j++){
1538           idx = pj[j];
1539           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
1540         }
1541         sctx.rs = rs;
1542         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1543         if (newshift == 1) goto endofwhile;
1544 
1545         if (*pc3 != 0.0){
1546           mul3   = (*pc3)/(*pc2);
1547           *pc3   = mul3;
1548           pj     = nbj + bd[prow];
1549           nz_tmp = bi[prow+1] - bd[prow] - 1;
1550           for (j=0; j<nz_tmp; j++) {
1551             idx = pj[j] ;
1552             tmp = rtmp22[idx];
1553             rtmp33[idx] -= mul3 * tmp;
1554           }
1555           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1556         }
1557 
1558         pj  = bj + bi[row];
1559         pc1 = ba + bi[row];
1560         pc2 = ba + bi[row+1];
1561         pc3 = ba + bi[row+2];
1562 
1563         sctx.pv = rtmp33[row+2];
1564         rs = 0.0;
1565         rtmp11[row]   = 1.0/rtmp11[row];
1566         rtmp22[row+1] = 1.0/rtmp22[row+1];
1567         rtmp33[row+2] = 1.0/rtmp33[row+2];
1568         /* copy row entries from dense representation to sparse */
1569         for (j=0; j<nz; j++) {
1570           idx    = pj[j];
1571           pc1[j] = rtmp11[idx];
1572           pc2[j] = rtmp22[idx];
1573           pc3[j] = rtmp33[idx];
1574           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1575         }
1576 
1577         sctx.rs = rs;
1578         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
1579         if (newshift == 1) goto endofwhile;
1580         break;
1581 
1582       default:
1583         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1584       }
1585       row += nodesz;                 /* Update the row */
1586     }
1587     endofwhile:;
1588   } while (sctx.useshift);
1589   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
1590   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1591   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1592   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1593   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1594   (B)->ops->solve           = MatSolve_SeqAIJ_Inode_inplace;
1595   /* do not set solve add, since MatSolve_Inode + Add is faster */
1596   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
1597   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
1598   C->assembled   = PETSC_TRUE;
1599   C->preallocated = PETSC_TRUE;
1600   if (sctx.nshift) {
1601     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) {
1602       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1603     } else if (info->shifttype == MAT_SHIFT_NONZERO) {
1604       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1605     }
1606   }
1607   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1608   PetscFunctionReturn(0);
1609 }
1610 
1611 
1612 /* ----------------------------------------------------------- */
1613 #undef __FUNCT__
1614 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
1615 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
1616 {
1617   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
1618   IS                iscol = a->col,isrow = a->row;
1619   PetscErrorCode    ierr;
1620   const PetscInt    *r,*c,*rout,*cout;
1621   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
1622   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
1623   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
1624   PetscScalar       sum1,sum2,sum3,sum4,sum5;
1625   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
1626   const PetscScalar *b;
1627 
1628   PetscFunctionBegin;
1629   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
1630   node_max = a->inode.node_count;
1631   ns       = a->inode.size;     /* Node Size array */
1632 
1633   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1634   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1635   tmp  = a->solve_work;
1636 
1637   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1638   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1639 
1640   /* forward solve the lower triangular */
1641   tmps = tmp ;
1642   aa   = a_a ;
1643   aj   = a_j ;
1644   ad   = a->diag;
1645 
1646   for (i = 0,row = 0; i< node_max; ++i){
1647     nsz = ns[i];
1648     aii = ai[row];
1649     v1  = aa + aii;
1650     vi  = aj + aii;
1651     nz  = ai[row+1]- ai[row];
1652 
1653     if (i < node_max-1) {
1654       /* Prefetch the indices for the next block */
1655       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
1656       /* Prefetch the data for the next block */
1657       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
1658     }
1659 
1660     switch (nsz){               /* Each loop in 'case' is unrolled */
1661     case 1 :
1662       sum1 = b[r[row]];
1663       for(j=0; j<nz-1; j+=2){
1664         i0   = vi[j];
1665         i1   = vi[j+1];
1666         tmp0 = tmps[i0];
1667         tmp1 = tmps[i1];
1668         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
1669       }
1670       if(j == nz-1){
1671         tmp0 = tmps[vi[j]];
1672         sum1 -= v1[j]*tmp0;
1673       }
1674       tmp[row++]=sum1;
1675       break;
1676     case 2:
1677       sum1 = b[r[row]];
1678       sum2 = b[r[row+1]];
1679       v2   = aa + ai[row+1];
1680 
1681       for(j=0; j<nz-1; j+=2){
1682         i0   = vi[j];
1683         i1   = vi[j+1];
1684         tmp0 = tmps[i0];
1685         tmp1 = tmps[i1];
1686         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1687         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1688       }
1689       if(j == nz-1){
1690         tmp0 = tmps[vi[j]];
1691         sum1 -= v1[j] *tmp0;
1692         sum2 -= v2[j] *tmp0;
1693       }
1694       sum2 -= v2[nz] * sum1;
1695       tmp[row ++]=sum1;
1696       tmp[row ++]=sum2;
1697       break;
1698     case 3:
1699       sum1 = b[r[row]];
1700       sum2 = b[r[row+1]];
1701       sum3 = b[r[row+2]];
1702       v2   = aa + ai[row+1];
1703       v3   = aa + ai[row+2];
1704 
1705       for (j=0; j<nz-1; j+=2){
1706         i0   = vi[j];
1707         i1   = vi[j+1];
1708         tmp0 = tmps[i0];
1709         tmp1 = tmps[i1];
1710         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1711         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1712         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1713       }
1714       if (j == nz-1){
1715         tmp0 = tmps[vi[j]];
1716         sum1 -= v1[j] *tmp0;
1717         sum2 -= v2[j] *tmp0;
1718         sum3 -= v3[j] *tmp0;
1719       }
1720       sum2 -= v2[nz] * sum1;
1721       sum3 -= v3[nz] * sum1;
1722       sum3 -= v3[nz+1] * sum2;
1723       tmp[row ++]=sum1;
1724       tmp[row ++]=sum2;
1725       tmp[row ++]=sum3;
1726       break;
1727 
1728     case 4:
1729       sum1 = b[r[row]];
1730       sum2 = b[r[row+1]];
1731       sum3 = b[r[row+2]];
1732       sum4 = b[r[row+3]];
1733       v2   = aa + ai[row+1];
1734       v3   = aa + ai[row+2];
1735       v4   = aa + ai[row+3];
1736 
1737       for (j=0; j<nz-1; j+=2){
1738         i0   = vi[j];
1739         i1   = vi[j+1];
1740         tmp0 = tmps[i0];
1741         tmp1 = tmps[i1];
1742         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1743         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1744         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1745         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
1746       }
1747       if (j == nz-1){
1748         tmp0 = tmps[vi[j]];
1749         sum1 -= v1[j] *tmp0;
1750         sum2 -= v2[j] *tmp0;
1751         sum3 -= v3[j] *tmp0;
1752         sum4 -= v4[j] *tmp0;
1753       }
1754       sum2 -= v2[nz] * sum1;
1755       sum3 -= v3[nz] * sum1;
1756       sum4 -= v4[nz] * sum1;
1757       sum3 -= v3[nz+1] * sum2;
1758       sum4 -= v4[nz+1] * sum2;
1759       sum4 -= v4[nz+2] * sum3;
1760 
1761       tmp[row ++]=sum1;
1762       tmp[row ++]=sum2;
1763       tmp[row ++]=sum3;
1764       tmp[row ++]=sum4;
1765       break;
1766     case 5:
1767       sum1 = b[r[row]];
1768       sum2 = b[r[row+1]];
1769       sum3 = b[r[row+2]];
1770       sum4 = b[r[row+3]];
1771       sum5 = b[r[row+4]];
1772       v2   = aa + ai[row+1];
1773       v3   = aa + ai[row+2];
1774       v4   = aa + ai[row+3];
1775       v5   = aa + ai[row+4];
1776 
1777       for (j=0; j<nz-1; j+=2){
1778         i0   = vi[j];
1779         i1   = vi[j+1];
1780         tmp0 = tmps[i0];
1781         tmp1 = tmps[i1];
1782         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1783         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1784         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1785         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
1786         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
1787       }
1788       if (j == nz-1){
1789         tmp0 = tmps[vi[j]];
1790         sum1 -= v1[j] *tmp0;
1791         sum2 -= v2[j] *tmp0;
1792         sum3 -= v3[j] *tmp0;
1793         sum4 -= v4[j] *tmp0;
1794         sum5 -= v5[j] *tmp0;
1795       }
1796 
1797       sum2 -= v2[nz] * sum1;
1798       sum3 -= v3[nz] * sum1;
1799       sum4 -= v4[nz] * sum1;
1800       sum5 -= v5[nz] * sum1;
1801       sum3 -= v3[nz+1] * sum2;
1802       sum4 -= v4[nz+1] * sum2;
1803       sum5 -= v5[nz+1] * sum2;
1804       sum4 -= v4[nz+2] * sum3;
1805       sum5 -= v5[nz+2] * sum3;
1806       sum5 -= v5[nz+3] * sum4;
1807 
1808       tmp[row ++]=sum1;
1809       tmp[row ++]=sum2;
1810       tmp[row ++]=sum3;
1811       tmp[row ++]=sum4;
1812       tmp[row ++]=sum5;
1813       break;
1814     default:
1815       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1816     }
1817   }
1818   /* backward solve the upper triangular */
1819   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1820     nsz = ns[i];
1821     aii = ad[row+1] + 1;
1822     v1  = aa + aii;
1823     vi  = aj + aii;
1824     nz  = ad[row]- ad[row+1] - 1;
1825 
1826     if (i > 0) {
1827       /* Prefetch the indices for the next block */
1828       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
1829       /* Prefetch the data for the next block */
1830       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
1831     }
1832 
1833     switch (nsz){               /* Each loop in 'case' is unrolled */
1834     case 1 :
1835       sum1 = tmp[row];
1836 
1837       for(j=0 ; j<nz-1; j+=2){
1838         i0   = vi[j];
1839         i1   = vi[j+1];
1840         tmp0 = tmps[i0];
1841         tmp1 = tmps[i1];
1842         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1843       }
1844       if (j == nz-1){
1845         tmp0  = tmps[vi[j]];
1846         sum1 -= v1[j]*tmp0;
1847       }
1848       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1849       break;
1850     case 2 :
1851       sum1 = tmp[row];
1852       sum2 = tmp[row-1];
1853       v2   = aa + ad[row] + 1;
1854       for (j=0 ; j<nz-1; j+=2){
1855         i0   = vi[j];
1856         i1   = vi[j+1];
1857         tmp0 = tmps[i0];
1858         tmp1 = tmps[i1];
1859         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1860         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1861       }
1862       if (j == nz-1){
1863         tmp0  = tmps[vi[j]];
1864         sum1 -= v1[j]* tmp0;
1865         sum2 -= v2[j+1]* tmp0;
1866       }
1867 
1868       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1869       sum2   -= v2[0] * tmp0;
1870       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1871       break;
1872     case 3 :
1873       sum1 = tmp[row];
1874       sum2 = tmp[row -1];
1875       sum3 = tmp[row -2];
1876       v2   = aa + ad[row] + 1;
1877       v3   = aa + ad[row -1] + 1;
1878       for (j=0 ; j<nz-1; j+=2){
1879         i0   = vi[j];
1880         i1   = vi[j+1];
1881         tmp0 = tmps[i0];
1882         tmp1 = tmps[i1];
1883         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1884         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1885         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1886       }
1887       if (j== nz-1){
1888         tmp0  = tmps[vi[j]];
1889         sum1 -= v1[j] * tmp0;
1890         sum2 -= v2[j+1] * tmp0;
1891         sum3 -= v3[j+2] * tmp0;
1892       }
1893       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1894       sum2   -= v2[0]* tmp0;
1895       sum3   -= v3[1] * tmp0;
1896       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1897       sum3   -= v3[0]* tmp0;
1898       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1899 
1900       break;
1901     case 4 :
1902       sum1 = tmp[row];
1903       sum2 = tmp[row -1];
1904       sum3 = tmp[row -2];
1905       sum4 = tmp[row -3];
1906       v2   = aa + ad[row]+1;
1907       v3   = aa + ad[row -1]+1;
1908       v4   = aa + ad[row -2]+1;
1909 
1910       for (j=0 ; j<nz-1; j+=2){
1911         i0   = vi[j];
1912         i1   = vi[j+1];
1913         tmp0 = tmps[i0];
1914         tmp1 = tmps[i1];
1915         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
1916         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1917         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1918         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
1919       }
1920       if (j== nz-1){
1921         tmp0  = tmps[vi[j]];
1922         sum1 -= v1[j] * tmp0;
1923         sum2 -= v2[j+1] * tmp0;
1924         sum3 -= v3[j+2] * tmp0;
1925         sum4 -= v4[j+3] * tmp0;
1926       }
1927 
1928       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1929       sum2   -= v2[0] * tmp0;
1930       sum3   -= v3[1] * tmp0;
1931       sum4   -= v4[2] * tmp0;
1932       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1933       sum3   -= v3[0] * tmp0;
1934       sum4   -= v4[1] * tmp0;
1935       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1936       sum4   -= v4[0] * tmp0;
1937       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
1938       break;
1939     case 5 :
1940       sum1 = tmp[row];
1941       sum2 = tmp[row -1];
1942       sum3 = tmp[row -2];
1943       sum4 = tmp[row -3];
1944       sum5 = tmp[row -4];
1945       v2   = aa + ad[row]+1;
1946       v3   = aa + ad[row -1]+1;
1947       v4   = aa + ad[row -2]+1;
1948       v5   = aa + ad[row -3]+1;
1949       for (j=0 ; j<nz-1; j+=2){
1950         i0   = vi[j];
1951         i1   = vi[j+1];
1952         tmp0 = tmps[i0];
1953         tmp1 = tmps[i1];
1954         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1955         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1956         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1957         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
1958         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
1959       }
1960       if (j==nz-1){
1961         tmp0  = tmps[vi[j]];
1962         sum1 -= v1[j] * tmp0;
1963         sum2 -= v2[j+1] * tmp0;
1964         sum3 -= v3[j+2] * tmp0;
1965         sum4 -= v4[j+3] * tmp0;
1966         sum5 -= v5[j+4] * tmp0;
1967       }
1968 
1969       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1970       sum2   -= v2[0] * tmp0;
1971       sum3   -= v3[1] * tmp0;
1972       sum4   -= v4[2] * tmp0;
1973       sum5   -= v5[3] * tmp0;
1974       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1975       sum3   -= v3[0] * tmp0;
1976       sum4   -= v4[1] * tmp0;
1977       sum5   -= v5[2] * tmp0;
1978       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1979       sum4   -= v4[0] * tmp0;
1980       sum5   -= v5[1] * tmp0;
1981       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
1982       sum5   -= v5[0] * tmp0;
1983       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
1984       break;
1985     default:
1986       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1987     }
1988   }
1989   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1990   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1991   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1992   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1993   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1994   PetscFunctionReturn(0);
1995 }
1996 
1997 
1998 /*
1999      Makes a longer coloring[] array and calls the usual code with that
2000 */
2001 #undef __FUNCT__
2002 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2003 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2004 {
2005   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2006   PetscErrorCode  ierr;
2007   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2008   PetscInt        *colorused,i;
2009   ISColoringValue *newcolor;
2010 
2011   PetscFunctionBegin;
2012   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2013   /* loop over inodes, marking a color for each column*/
2014   row = 0;
2015   for (i=0; i<m; i++){
2016     for (j=0; j<ns[i]; j++) {
2017       newcolor[row++] = coloring[i] + j*ncolors;
2018     }
2019   }
2020 
2021   /* eliminate unneeded colors */
2022   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2023   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2024   for (i=0; i<n; i++) {
2025     colorused[newcolor[i]] = 1;
2026   }
2027 
2028   for (i=1; i<5*ncolors; i++) {
2029     colorused[i] += colorused[i-1];
2030   }
2031   ncolors = colorused[5*ncolors-1];
2032   for (i=0; i<n; i++) {
2033     newcolor[i] = colorused[newcolor[i]]-1;
2034   }
2035   ierr = PetscFree(colorused);CHKERRQ(ierr);
2036   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2037   ierr = PetscFree(coloring);CHKERRQ(ierr);
2038   PetscFunctionReturn(0);
2039 }
2040 
2041 #include "../src/mat/blockinvert.h"
2042 
2043 #undef __FUNCT__
2044 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2045 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2046 {
2047   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2048   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2049   MatScalar          *ibdiag,*bdiag,work[25];
2050   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2051   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2052   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2053   PetscErrorCode     ierr;
2054   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2055   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2056 
2057   PetscFunctionBegin;
2058   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2059   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2060   if (its > 1) {
2061     /* switch to non-inode version */
2062     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2063     PetscFunctionReturn(0);
2064   }
2065 
2066   if (!a->inode.ibdiagvalid) {
2067     if (!a->inode.ibdiag) {
2068       /* calculate space needed for diagonal blocks */
2069       for (i=0; i<m; i++) {
2070 	cnt += sizes[i]*sizes[i];
2071       }
2072       a->inode.bdiagsize = cnt;
2073       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2074     }
2075 
2076     /* copy over the diagonal blocks and invert them */
2077     ibdiag = a->inode.ibdiag;
2078     bdiag  = a->inode.bdiag;
2079     cnt = 0;
2080     for (i=0, row = 0; i<m; i++) {
2081       for (j=0; j<sizes[i]; j++) {
2082         for (k=0; k<sizes[i]; k++) {
2083           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2084         }
2085       }
2086       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2087 
2088       switch(sizes[i]) {
2089         case 1:
2090           /* Create matrix data structure */
2091           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2092           ibdiag[cnt] = 1.0/ibdiag[cnt];
2093           break;
2094         case 2:
2095           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2096           break;
2097         case 3:
2098           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2099           break;
2100         case 4:
2101           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2102           break;
2103         case 5:
2104           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2105           break;
2106        default:
2107 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2108       }
2109       cnt += sizes[i]*sizes[i];
2110       row += sizes[i];
2111     }
2112     a->inode.ibdiagvalid = PETSC_TRUE;
2113   }
2114   ibdiag = a->inode.ibdiag;
2115   bdiag  = a->inode.bdiag;
2116 
2117 
2118   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2119   if (flag & SOR_ZERO_INITIAL_GUESS) {
2120     PetscScalar *b;
2121     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2122     if (xx != bb) {
2123       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2124     } else {
2125       b = x;
2126     }
2127     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2128 
2129       for (i=0, row=0; i<m; i++) {
2130         sz  = diag[row] - ii[row];
2131         v1  = a->a + ii[row];
2132         idx = a->j + ii[row];
2133 
2134         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2135         switch (sizes[i]){
2136           case 1:
2137 
2138             sum1  = b[row];
2139             for(n = 0; n<sz-1; n+=2) {
2140               i1   = idx[0];
2141               i2   = idx[1];
2142               idx += 2;
2143               tmp0 = x[i1];
2144               tmp1 = x[i2];
2145               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2146             }
2147 
2148             if (n == sz-1){
2149               tmp0  = x[*idx];
2150               sum1 -= *v1 * tmp0;
2151             }
2152             x[row++] = sum1*(*ibdiag++);
2153             break;
2154           case 2:
2155             v2    = a->a + ii[row+1];
2156             sum1  = b[row];
2157             sum2  = b[row+1];
2158             for(n = 0; n<sz-1; n+=2) {
2159               i1   = idx[0];
2160               i2   = idx[1];
2161               idx += 2;
2162               tmp0 = x[i1];
2163               tmp1 = x[i2];
2164               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2165               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2166             }
2167 
2168             if (n == sz-1){
2169               tmp0  = x[*idx];
2170               sum1 -= v1[0] * tmp0;
2171               sum2 -= v2[0] * tmp0;
2172             }
2173             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2174             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2175             ibdiag  += 4;
2176             break;
2177           case 3:
2178             v2    = a->a + ii[row+1];
2179             v3    = a->a + ii[row+2];
2180             sum1  = b[row];
2181             sum2  = b[row+1];
2182             sum3  = b[row+2];
2183             for(n = 0; n<sz-1; n+=2) {
2184               i1   = idx[0];
2185               i2   = idx[1];
2186               idx += 2;
2187               tmp0 = x[i1];
2188               tmp1 = x[i2];
2189               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2190               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2191               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2192             }
2193 
2194             if (n == sz-1){
2195               tmp0  = x[*idx];
2196               sum1 -= v1[0] * tmp0;
2197               sum2 -= v2[0] * tmp0;
2198               sum3 -= v3[0] * tmp0;
2199             }
2200             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2201             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2202             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2203             ibdiag  += 9;
2204             break;
2205           case 4:
2206             v2    = a->a + ii[row+1];
2207             v3    = a->a + ii[row+2];
2208             v4    = a->a + ii[row+3];
2209             sum1  = b[row];
2210             sum2  = b[row+1];
2211             sum3  = b[row+2];
2212             sum4  = b[row+3];
2213             for(n = 0; n<sz-1; n+=2) {
2214               i1   = idx[0];
2215               i2   = idx[1];
2216               idx += 2;
2217               tmp0 = x[i1];
2218               tmp1 = x[i2];
2219               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2220               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2221               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2222               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2223             }
2224 
2225             if (n == sz-1){
2226               tmp0  = x[*idx];
2227               sum1 -= v1[0] * tmp0;
2228               sum2 -= v2[0] * tmp0;
2229               sum3 -= v3[0] * tmp0;
2230               sum4 -= v4[0] * tmp0;
2231             }
2232             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2233             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2234             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2235             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2236             ibdiag  += 16;
2237             break;
2238           case 5:
2239             v2    = a->a + ii[row+1];
2240             v3    = a->a + ii[row+2];
2241             v4    = a->a + ii[row+3];
2242             v5    = a->a + ii[row+4];
2243             sum1  = b[row];
2244             sum2  = b[row+1];
2245             sum3  = b[row+2];
2246             sum4  = b[row+3];
2247             sum5  = b[row+4];
2248             for(n = 0; n<sz-1; n+=2) {
2249               i1   = idx[0];
2250               i2   = idx[1];
2251               idx += 2;
2252               tmp0 = x[i1];
2253               tmp1 = x[i2];
2254               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2255               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2256               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2257               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2258               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2259             }
2260 
2261             if (n == sz-1){
2262               tmp0  = x[*idx];
2263               sum1 -= v1[0] * tmp0;
2264               sum2 -= v2[0] * tmp0;
2265               sum3 -= v3[0] * tmp0;
2266               sum4 -= v4[0] * tmp0;
2267               sum5 -= v5[0] * tmp0;
2268             }
2269             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2270             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2271             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2272             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2273             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2274             ibdiag  += 25;
2275             break;
2276 	  default:
2277    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2278         }
2279       }
2280 
2281       xb = x;
2282       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2283     } else xb = b;
2284     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2285         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
2286       cnt = 0;
2287       for (i=0, row=0; i<m; i++) {
2288 
2289         switch (sizes[i]){
2290           case 1:
2291             x[row++] *= bdiag[cnt++];
2292             break;
2293           case 2:
2294             x1   = x[row]; x2 = x[row+1];
2295             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2296             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2297             x[row++] = tmp1;
2298             x[row++] = tmp2;
2299             cnt += 4;
2300             break;
2301           case 3:
2302             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2303             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2304             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2305             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2306             x[row++] = tmp1;
2307             x[row++] = tmp2;
2308             x[row++] = tmp3;
2309             cnt += 9;
2310             break;
2311           case 4:
2312             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2313             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2314             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2315             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2316             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2317             x[row++] = tmp1;
2318             x[row++] = tmp2;
2319             x[row++] = tmp3;
2320             x[row++] = tmp4;
2321             cnt += 16;
2322             break;
2323           case 5:
2324             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2325             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2326             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2327             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2328             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2329             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2330             x[row++] = tmp1;
2331             x[row++] = tmp2;
2332             x[row++] = tmp3;
2333             x[row++] = tmp4;
2334             x[row++] = tmp5;
2335             cnt += 25;
2336             break;
2337 	  default:
2338    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2339         }
2340       }
2341       ierr = PetscLogFlops(m);CHKERRQ(ierr);
2342     }
2343     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
2344 
2345       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2346       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2347         ibdiag -= sizes[i]*sizes[i];
2348         sz      = ii[row+1] - diag[row] - 1;
2349         v1      = a->a + diag[row] + 1;
2350         idx     = a->j + diag[row] + 1;
2351 
2352         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2353         switch (sizes[i]){
2354           case 1:
2355 
2356             sum1  = xb[row];
2357             for(n = 0; n<sz-1; n+=2) {
2358               i1   = idx[0];
2359               i2   = idx[1];
2360               idx += 2;
2361               tmp0 = x[i1];
2362               tmp1 = x[i2];
2363               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2364             }
2365 
2366             if (n == sz-1){
2367               tmp0  = x[*idx];
2368               sum1 -= *v1*tmp0;
2369             }
2370             x[row--] = sum1*(*ibdiag);
2371             break;
2372 
2373           case 2:
2374 
2375             sum1  = xb[row];
2376             sum2  = xb[row-1];
2377             /* note that sum1 is associated with the second of the two rows */
2378             v2    = a->a + diag[row-1] + 2;
2379             for(n = 0; n<sz-1; n+=2) {
2380               i1   = idx[0];
2381               i2   = idx[1];
2382               idx += 2;
2383               tmp0 = x[i1];
2384               tmp1 = x[i2];
2385               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2386               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2387             }
2388 
2389             if (n == sz-1){
2390               tmp0  = x[*idx];
2391               sum1 -= *v1*tmp0;
2392               sum2 -= *v2*tmp0;
2393             }
2394             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
2395             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
2396             break;
2397           case 3:
2398 
2399             sum1  = xb[row];
2400             sum2  = xb[row-1];
2401             sum3  = xb[row-2];
2402             v2    = a->a + diag[row-1] + 2;
2403             v3    = a->a + diag[row-2] + 3;
2404             for(n = 0; n<sz-1; n+=2) {
2405               i1   = idx[0];
2406               i2   = idx[1];
2407               idx += 2;
2408               tmp0 = x[i1];
2409               tmp1 = x[i2];
2410               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2411               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2412               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2413             }
2414 
2415             if (n == sz-1){
2416               tmp0  = x[*idx];
2417               sum1 -= *v1*tmp0;
2418               sum2 -= *v2*tmp0;
2419               sum3 -= *v3*tmp0;
2420             }
2421             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2422             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2423             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2424             break;
2425           case 4:
2426 
2427             sum1  = xb[row];
2428             sum2  = xb[row-1];
2429             sum3  = xb[row-2];
2430             sum4  = xb[row-3];
2431             v2    = a->a + diag[row-1] + 2;
2432             v3    = a->a + diag[row-2] + 3;
2433             v4    = a->a + diag[row-3] + 4;
2434             for(n = 0; n<sz-1; n+=2) {
2435               i1   = idx[0];
2436               i2   = idx[1];
2437               idx += 2;
2438               tmp0 = x[i1];
2439               tmp1 = x[i2];
2440               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2441               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2442               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2443               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2444             }
2445 
2446             if (n == sz-1){
2447               tmp0  = x[*idx];
2448               sum1 -= *v1*tmp0;
2449               sum2 -= *v2*tmp0;
2450               sum3 -= *v3*tmp0;
2451               sum4 -= *v4*tmp0;
2452             }
2453             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2454             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2455             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2456             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2457             break;
2458           case 5:
2459 
2460             sum1  = xb[row];
2461             sum2  = xb[row-1];
2462             sum3  = xb[row-2];
2463             sum4  = xb[row-3];
2464             sum5  = xb[row-4];
2465             v2    = a->a + diag[row-1] + 2;
2466             v3    = a->a + diag[row-2] + 3;
2467             v4    = a->a + diag[row-3] + 4;
2468             v5    = a->a + diag[row-4] + 5;
2469             for(n = 0; n<sz-1; n+=2) {
2470               i1   = idx[0];
2471               i2   = idx[1];
2472               idx += 2;
2473               tmp0 = x[i1];
2474               tmp1 = x[i2];
2475               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2476               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2477               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2478               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2479               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2480             }
2481 
2482             if (n == sz-1){
2483               tmp0  = x[*idx];
2484               sum1 -= *v1*tmp0;
2485               sum2 -= *v2*tmp0;
2486               sum3 -= *v3*tmp0;
2487               sum4 -= *v4*tmp0;
2488               sum5 -= *v5*tmp0;
2489             }
2490             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2491             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2492             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2493             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2494             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2495             break;
2496 	  default:
2497    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2498         }
2499       }
2500 
2501       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2502     }
2503     its--;
2504     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2505     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2506   }
2507   if (flag & SOR_EISENSTAT) {
2508     const PetscScalar *b;
2509     MatScalar         *t = a->inode.ssor_work;
2510 
2511     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2512     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2513     /*
2514           Apply  (U + D)^-1  where D is now the block diagonal
2515     */
2516     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2517     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2518       ibdiag -= sizes[i]*sizes[i];
2519       sz      = ii[row+1] - diag[row] - 1;
2520       v1      = a->a + diag[row] + 1;
2521       idx     = a->j + diag[row] + 1;
2522       CHKMEMQ;
2523       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2524       switch (sizes[i]){
2525         case 1:
2526 
2527 	  sum1  = b[row];
2528 	  for(n = 0; n<sz-1; n+=2) {
2529 	    i1   = idx[0];
2530 	    i2   = idx[1];
2531 	    idx += 2;
2532 	    tmp0 = x[i1];
2533 	    tmp1 = x[i2];
2534 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2535 	  }
2536 
2537 	  if (n == sz-1){
2538 	    tmp0  = x[*idx];
2539 	    sum1 -= *v1*tmp0;
2540 	  }
2541 	  x[row] = sum1*(*ibdiag);row--;
2542 	  break;
2543 
2544         case 2:
2545 
2546 	  sum1  = b[row];
2547 	  sum2  = b[row-1];
2548 	  /* note that sum1 is associated with the second of the two rows */
2549 	  v2    = a->a + diag[row-1] + 2;
2550 	  for(n = 0; n<sz-1; n+=2) {
2551 	    i1   = idx[0];
2552 	    i2   = idx[1];
2553 	    idx += 2;
2554 	    tmp0 = x[i1];
2555 	    tmp1 = x[i2];
2556 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2557 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2558 	  }
2559 
2560 	  if (n == sz-1){
2561 	    tmp0  = x[*idx];
2562 	    sum1 -= *v1*tmp0;
2563 	    sum2 -= *v2*tmp0;
2564 	  }
2565 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
2566 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
2567           row -= 2;
2568 	  break;
2569         case 3:
2570 
2571 	  sum1  = b[row];
2572 	  sum2  = b[row-1];
2573 	  sum3  = b[row-2];
2574 	  v2    = a->a + diag[row-1] + 2;
2575 	  v3    = a->a + diag[row-2] + 3;
2576 	  for(n = 0; n<sz-1; n+=2) {
2577 	    i1   = idx[0];
2578 	    i2   = idx[1];
2579 	    idx += 2;
2580 	    tmp0 = x[i1];
2581 	    tmp1 = x[i2];
2582 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2583 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2584 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2585 	  }
2586 
2587 	  if (n == sz-1){
2588 	    tmp0  = x[*idx];
2589 	    sum1 -= *v1*tmp0;
2590 	    sum2 -= *v2*tmp0;
2591 	    sum3 -= *v3*tmp0;
2592 	  }
2593 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2594 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2595 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2596           row -= 3;
2597 	  break;
2598         case 4:
2599 
2600 	  sum1  = b[row];
2601 	  sum2  = b[row-1];
2602 	  sum3  = b[row-2];
2603 	  sum4  = b[row-3];
2604 	  v2    = a->a + diag[row-1] + 2;
2605 	  v3    = a->a + diag[row-2] + 3;
2606 	  v4    = a->a + diag[row-3] + 4;
2607 	  for(n = 0; n<sz-1; n+=2) {
2608 	    i1   = idx[0];
2609 	    i2   = idx[1];
2610 	    idx += 2;
2611 	    tmp0 = x[i1];
2612 	    tmp1 = x[i2];
2613 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2614 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2615 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2616 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2617 	  }
2618 
2619 	  if (n == sz-1){
2620 	    tmp0  = x[*idx];
2621 	    sum1 -= *v1*tmp0;
2622 	    sum2 -= *v2*tmp0;
2623 	    sum3 -= *v3*tmp0;
2624 	    sum4 -= *v4*tmp0;
2625 	  }
2626 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2627 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2628 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2629 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2630           row -= 4;
2631 	  break;
2632         case 5:
2633 
2634 	  sum1  = b[row];
2635 	  sum2  = b[row-1];
2636 	  sum3  = b[row-2];
2637 	  sum4  = b[row-3];
2638 	  sum5  = b[row-4];
2639 	  v2    = a->a + diag[row-1] + 2;
2640 	  v3    = a->a + diag[row-2] + 3;
2641 	  v4    = a->a + diag[row-3] + 4;
2642 	  v5    = a->a + diag[row-4] + 5;
2643 	  for(n = 0; n<sz-1; n+=2) {
2644 	    i1   = idx[0];
2645 	    i2   = idx[1];
2646 	    idx += 2;
2647 	    tmp0 = x[i1];
2648 	    tmp1 = x[i2];
2649 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2650 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2651 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2652 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2653 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2654 	  }
2655 
2656 	  if (n == sz-1){
2657 	    tmp0  = x[*idx];
2658 	    sum1 -= *v1*tmp0;
2659 	    sum2 -= *v2*tmp0;
2660 	    sum3 -= *v3*tmp0;
2661 	    sum4 -= *v4*tmp0;
2662 	    sum5 -= *v5*tmp0;
2663 	  }
2664 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2665 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2666 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2667 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2668 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2669           row -= 5;
2670 	  break;
2671         default:
2672 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2673       }
2674       CHKMEMQ;
2675     }
2676     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2677 
2678     /*
2679            t = b - D x    where D is the block diagonal
2680     */
2681     cnt = 0;
2682     for (i=0, row=0; i<m; i++) {
2683       CHKMEMQ;
2684       switch (sizes[i]){
2685         case 1:
2686 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
2687 	  break;
2688         case 2:
2689 	  x1   = x[row]; x2 = x[row+1];
2690 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2691 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2692 	  t[row]   = b[row] - tmp1;
2693 	  t[row+1] = b[row+1] - tmp2; row += 2;
2694 	  cnt += 4;
2695 	  break;
2696         case 3:
2697 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2698 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2699 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2700 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2701 	  t[row] = b[row] - tmp1;
2702 	  t[row+1] = b[row+1] - tmp2;
2703 	  t[row+2] = b[row+2] - tmp3; row += 3;
2704 	  cnt += 9;
2705 	  break;
2706         case 4:
2707 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2708 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2709 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2710 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2711 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2712 	  t[row] = b[row] - tmp1;
2713 	  t[row+1] = b[row+1] - tmp2;
2714 	  t[row+2] = b[row+2] - tmp3;
2715 	  t[row+3] = b[row+3] - tmp4; row += 4;
2716 	  cnt += 16;
2717 	  break;
2718         case 5:
2719 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2720 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2721 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2722 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2723 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2724 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2725 	  t[row] = b[row] - tmp1;
2726 	  t[row+1] = b[row+1] - tmp2;
2727 	  t[row+2] = b[row+2] - tmp3;
2728 	  t[row+3] = b[row+3] - tmp4;
2729 	  t[row+4] = b[row+4] - tmp5;row += 5;
2730 	  cnt += 25;
2731 	  break;
2732         default:
2733 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2734       }
2735       CHKMEMQ;
2736     }
2737     ierr = PetscLogFlops(m);CHKERRQ(ierr);
2738 
2739 
2740 
2741     /*
2742           Apply (L + D)^-1 where D is the block diagonal
2743     */
2744     for (i=0, row=0; i<m; i++) {
2745       sz  = diag[row] - ii[row];
2746       v1  = a->a + ii[row];
2747       idx = a->j + ii[row];
2748       CHKMEMQ;
2749       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2750       switch (sizes[i]){
2751         case 1:
2752 
2753 	  sum1  = t[row];
2754 	  for(n = 0; n<sz-1; n+=2) {
2755 	    i1   = idx[0];
2756 	    i2   = idx[1];
2757 	    idx += 2;
2758 	    tmp0 = t[i1];
2759 	    tmp1 = t[i2];
2760 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2761 	  }
2762 
2763 	  if (n == sz-1){
2764 	    tmp0  = t[*idx];
2765 	    sum1 -= *v1 * tmp0;
2766 	  }
2767 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
2768 	  break;
2769         case 2:
2770 	  v2    = a->a + ii[row+1];
2771 	  sum1  = t[row];
2772 	  sum2  = t[row+1];
2773 	  for(n = 0; n<sz-1; n+=2) {
2774 	    i1   = idx[0];
2775 	    i2   = idx[1];
2776 	    idx += 2;
2777 	    tmp0 = t[i1];
2778 	    tmp1 = t[i2];
2779 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2780 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2781 	  }
2782 
2783 	  if (n == sz-1){
2784 	    tmp0  = t[*idx];
2785 	    sum1 -= v1[0] * tmp0;
2786 	    sum2 -= v2[0] * tmp0;
2787 	  }
2788 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
2789 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
2790 	  ibdiag  += 4; row += 2;
2791 	  break;
2792         case 3:
2793 	  v2    = a->a + ii[row+1];
2794 	  v3    = a->a + ii[row+2];
2795 	  sum1  = t[row];
2796 	  sum2  = t[row+1];
2797 	  sum3  = t[row+2];
2798 	  for(n = 0; n<sz-1; n+=2) {
2799 	    i1   = idx[0];
2800 	    i2   = idx[1];
2801 	    idx += 2;
2802 	    tmp0 = t[i1];
2803 	    tmp1 = t[i2];
2804 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2805 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2806 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2807 	  }
2808 
2809 	  if (n == sz-1){
2810 	    tmp0  = t[*idx];
2811 	    sum1 -= v1[0] * tmp0;
2812 	    sum2 -= v2[0] * tmp0;
2813 	    sum3 -= v3[0] * tmp0;
2814 	  }
2815 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2816 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2817 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2818 	  ibdiag  += 9; row += 3;
2819 	  break;
2820         case 4:
2821 	  v2    = a->a + ii[row+1];
2822 	  v3    = a->a + ii[row+2];
2823 	  v4    = a->a + ii[row+3];
2824 	  sum1  = t[row];
2825 	  sum2  = t[row+1];
2826 	  sum3  = t[row+2];
2827 	  sum4  = t[row+3];
2828 	  for(n = 0; n<sz-1; n+=2) {
2829 	    i1   = idx[0];
2830 	    i2   = idx[1];
2831 	    idx += 2;
2832 	    tmp0 = t[i1];
2833 	    tmp1 = t[i2];
2834 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2835 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2836 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2837 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2838 	  }
2839 
2840 	  if (n == sz-1){
2841 	    tmp0  = t[*idx];
2842 	    sum1 -= v1[0] * tmp0;
2843 	    sum2 -= v2[0] * tmp0;
2844 	    sum3 -= v3[0] * tmp0;
2845 	    sum4 -= v4[0] * tmp0;
2846 	  }
2847 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2848 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2849 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2850 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2851 	  ibdiag  += 16; row += 4;
2852 	  break;
2853         case 5:
2854 	  v2    = a->a + ii[row+1];
2855 	  v3    = a->a + ii[row+2];
2856 	  v4    = a->a + ii[row+3];
2857 	  v5    = a->a + ii[row+4];
2858 	  sum1  = t[row];
2859 	  sum2  = t[row+1];
2860 	  sum3  = t[row+2];
2861 	  sum4  = t[row+3];
2862 	  sum5  = t[row+4];
2863 	  for(n = 0; n<sz-1; n+=2) {
2864 	    i1   = idx[0];
2865 	    i2   = idx[1];
2866 	    idx += 2;
2867 	    tmp0 = t[i1];
2868 	    tmp1 = t[i2];
2869 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2870 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2871 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2872 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2873 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2874 	  }
2875 
2876 	  if (n == sz-1){
2877 	    tmp0  = t[*idx];
2878 	    sum1 -= v1[0] * tmp0;
2879 	    sum2 -= v2[0] * tmp0;
2880 	    sum3 -= v3[0] * tmp0;
2881 	    sum4 -= v4[0] * tmp0;
2882 	    sum5 -= v5[0] * tmp0;
2883 	  }
2884 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2885 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2886 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2887 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2888 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2889 	  ibdiag  += 25; row += 5;
2890 	  break;
2891         default:
2892 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2893       }
2894       CHKMEMQ;
2895     }
2896     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2897     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2898     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2899   }
2900   PetscFunctionReturn(0);
2901 }
2902 
2903 #undef __FUNCT__
2904 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
2905 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2906 {
2907   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2908   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
2909   const MatScalar    *bdiag = a->inode.bdiag;
2910   const PetscScalar  *b;
2911   PetscErrorCode      ierr;
2912   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
2913   const PetscInt      *sizes = a->inode.size;
2914 
2915   PetscFunctionBegin;
2916   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2917   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2918   cnt = 0;
2919   for (i=0, row=0; i<m; i++) {
2920     switch (sizes[i]){
2921       case 1:
2922 	x[row] = b[row]*bdiag[cnt++];row++;
2923 	break;
2924       case 2:
2925 	x1   = b[row]; x2 = b[row+1];
2926 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2927 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2928 	x[row++] = tmp1;
2929 	x[row++] = tmp2;
2930 	cnt += 4;
2931 	break;
2932       case 3:
2933 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
2934 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2935 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2936 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2937 	x[row++] = tmp1;
2938 	x[row++] = tmp2;
2939 	x[row++] = tmp3;
2940 	cnt += 9;
2941 	break;
2942       case 4:
2943 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
2944 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2945 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2946 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2947 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2948 	x[row++] = tmp1;
2949 	x[row++] = tmp2;
2950 	x[row++] = tmp3;
2951 	x[row++] = tmp4;
2952 	cnt += 16;
2953 	break;
2954       case 5:
2955 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
2956 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2957 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2958 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2959 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2960 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2961 	x[row++] = tmp1;
2962 	x[row++] = tmp2;
2963 	x[row++] = tmp3;
2964 	x[row++] = tmp4;
2965 	x[row++] = tmp5;
2966 	cnt += 25;
2967 	break;
2968       default:
2969 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2970     }
2971   }
2972   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
2973   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2974   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2975   PetscFunctionReturn(0);
2976 }
2977 
2978 /*
2979     samestructure indicates that the matrix has not changed its nonzero structure so we
2980     do not need to recompute the inodes
2981 */
2982 #undef __FUNCT__
2983 #define __FUNCT__ "Mat_CheckInode"
2984 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2985 {
2986   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2987   PetscErrorCode ierr;
2988   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2989   PetscTruth     flag;
2990 
2991   PetscFunctionBegin;
2992   if (!a->inode.use)                     PetscFunctionReturn(0);
2993   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2994 
2995 
2996   m = A->rmap->n;
2997   if (a->inode.size) {ns = a->inode.size;}
2998   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2999 
3000   i          = 0;
3001   node_count = 0;
3002   idx        = a->j;
3003   ii         = a->i;
3004   while (i < m){                /* For each row */
3005     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3006     /* Limits the number of elements in a node to 'a->inode.limit' */
3007     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3008       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3009       if (nzy != nzx) break;
3010       idy  += nzx;             /* Same nonzero pattern */
3011       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3012       if (!flag) break;
3013     }
3014     ns[node_count++] = blk_size;
3015     idx += blk_size*nzx;
3016     i    = j;
3017   }
3018   /* If not enough inodes found,, do not use inode version of the routines */
3019   if (!a->inode.size && m && node_count > .9*m) {
3020     ierr = PetscFree(ns);CHKERRQ(ierr);
3021     a->inode.node_count     = 0;
3022     a->inode.size           = PETSC_NULL;
3023     a->inode.use            = PETSC_FALSE;
3024     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3025   } else {
3026     A->ops->mult              = MatMult_SeqAIJ_Inode;
3027     A->ops->sor               = MatSOR_SeqAIJ_Inode;
3028     A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3029     A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3030     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3031     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3032     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3033     A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3034     A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3035     a->inode.node_count       = node_count;
3036     a->inode.size             = ns;
3037     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3038   }
3039   PetscFunctionReturn(0);
3040 }
3041 
3042 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
3043 PetscInt __k, *__vi; \
3044 __vi = aj + ai[row];				\
3045 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
3046 __vi = aj + adiag[row];				\
3047 cols[nzl] = __vi[0];\
3048 __vi = aj + adiag[row+1]+1;\
3049 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
3050 
3051 
3052 /*
3053    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3054    Modified from Mat_CheckInode().
3055 
3056    Input Parameters:
3057 +  Mat A - ILU or LU matrix factor
3058 -  samestructure - TURE indicates that the matrix has not changed its nonzero structure so we
3059     do not need to recompute the inodes
3060 */
3061 #undef __FUNCT__
3062 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3063 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3064 {
3065   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3066   PetscErrorCode ierr;
3067   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3068   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3069   PetscTruth     flag;
3070 
3071   PetscFunctionBegin;
3072   if (!a->inode.use)                     PetscFunctionReturn(0);
3073   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3074 
3075   m = A->rmap->n;
3076   if (a->inode.size) {ns = a->inode.size;}
3077   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3078 
3079   i          = 0;
3080   node_count = 0;
3081   while (i < m){                /* For each row */
3082     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3083     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3084     nzx  = nzl1 + nzu1 + 1;
3085     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3086     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3087 
3088     /* Limits the number of elements in a node to 'a->inode.limit' */
3089     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3090       nzl2    = ai[j+1] - ai[j];
3091       nzu2    = adiag[j] - adiag[j+1] - 1;
3092       nzy     = nzl2 + nzu2 + 1;
3093       if( nzy != nzx) break;
3094       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3095       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
3096       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3097       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3098       ierr = PetscFree(cols2);CHKERRQ(ierr);
3099     }
3100     ns[node_count++] = blk_size;
3101     ierr = PetscFree(cols1);CHKERRQ(ierr);
3102     i    = j;
3103   }
3104   /* If not enough inodes found,, do not use inode version of the routines */
3105   if (!a->inode.size && m && node_count > .9*m) {
3106     ierr = PetscFree(ns);CHKERRQ(ierr);
3107     a->inode.node_count     = 0;
3108     a->inode.size           = PETSC_NULL;
3109     a->inode.use            = PETSC_FALSE;
3110     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3111   } else {
3112     A->ops->mult              = 0;
3113     A->ops->sor               = 0;
3114     A->ops->multadd           = 0;
3115     A->ops->getrowij          = 0;
3116     A->ops->restorerowij      = 0;
3117     A->ops->getcolumnij       = 0;
3118     A->ops->restorecolumnij   = 0;
3119     A->ops->coloringpatch     = 0;
3120     A->ops->multdiagonalblock = 0;
3121     a->inode.node_count       = node_count;
3122     a->inode.size             = ns;
3123     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3124   }
3125   PetscFunctionReturn(0);
3126 }
3127 
3128 /*
3129      This is really ugly. if inodes are used this replaces the
3130   permutations with ones that correspond to rows/cols of the matrix
3131   rather then inode blocks
3132 */
3133 #undef __FUNCT__
3134 #define __FUNCT__ "MatInodeAdjustForInodes"
3135 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3136 {
3137   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3138 
3139   PetscFunctionBegin;
3140   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3141   if (f) {
3142     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3143   }
3144   PetscFunctionReturn(0);
3145 }
3146 
3147 EXTERN_C_BEGIN
3148 #undef __FUNCT__
3149 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3150 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3151 {
3152   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3153   PetscErrorCode ierr;
3154   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3155   const PetscInt *ridx,*cidx;
3156   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3157   PetscInt       nslim_col,*ns_col;
3158   IS             ris = *rperm,cis = *cperm;
3159 
3160   PetscFunctionBegin;
3161   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3162   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3163 
3164   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3165   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3166   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3167 
3168   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3169   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3170 
3171   /* Form the inode structure for the rows of permuted matric using inv perm*/
3172   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3173 
3174   /* Construct the permutations for rows*/
3175   for (i=0,row = 0; i<nslim_row; ++i){
3176     indx      = ridx[i];
3177     start_val = tns[indx];
3178     end_val   = tns[indx + 1];
3179     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3180   }
3181 
3182   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3183   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3184 
3185  /* Construct permutations for columns */
3186   for (i=0,col=0; i<nslim_col; ++i){
3187     indx      = cidx[i];
3188     start_val = tns[indx];
3189     end_val   = tns[indx + 1];
3190     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3191   }
3192 
3193   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3194   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3195   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3196   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3197 
3198   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3199   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3200 
3201   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3202   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3203   ierr = ISDestroy(cis);CHKERRQ(ierr);
3204   ierr = ISDestroy(ris);CHKERRQ(ierr);
3205   ierr = PetscFree(tns);CHKERRQ(ierr);
3206   PetscFunctionReturn(0);
3207 }
3208 EXTERN_C_END
3209 
3210 #undef __FUNCT__
3211 #define __FUNCT__ "MatInodeGetInodeSizes"
3212 /*@C
3213    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3214 
3215    Collective on Mat
3216 
3217    Input Parameter:
3218 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3219 
3220    Output Parameter:
3221 +  node_count - no of inodes present in the matrix.
3222 .  sizes      - an array of size node_count,with sizes of each inode.
3223 -  limit      - the max size used to generate the inodes.
3224 
3225    Level: advanced
3226 
3227    Notes: This routine returns some internal storage information
3228    of the matrix, it is intended to be used by advanced users.
3229    It should be called after the matrix is assembled.
3230    The contents of the sizes[] array should not be changed.
3231    PETSC_NULL may be passed for information not requested.
3232 
3233 .keywords: matrix, seqaij, get, inode
3234 
3235 .seealso: MatGetInfo()
3236 @*/
3237 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3238 {
3239   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3240 
3241   PetscFunctionBegin;
3242   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3243   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3244   if (f) {
3245     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3246   }
3247   PetscFunctionReturn(0);
3248 }
3249 
3250 EXTERN_C_BEGIN
3251 #undef __FUNCT__
3252 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3253 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3254 {
3255   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3256 
3257   PetscFunctionBegin;
3258   if (node_count) *node_count = a->inode.node_count;
3259   if (sizes)      *sizes      = a->inode.size;
3260   if (limit)      *limit      = a->inode.limit;
3261   PetscFunctionReturn(0);
3262 }
3263 EXTERN_C_END
3264