xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 2ced604035a9bfaf0d10926b54a10b89e014d405)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
781 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818 
819     switch (nsz){               /* Each loop in 'case' is unrolled */
820     case 1 :
821       sum1 = b[*r++];
822       for(j=0; j<nz-1; j+=2){
823         i0   = vi[0];
824         i1   = vi[1];
825         vi  +=2;
826         tmp0 = tmps[i0];
827         tmp1 = tmps[i1];
828         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
829       }
830       if(j == nz-1){
831         tmp0 = tmps[*vi++];
832         sum1 -= *v1++ *tmp0;
833       }
834       tmp[row ++]=sum1;
835       break;
836     case 2:
837       sum1 = b[*r++];
838       sum2 = b[*r++];
839       v2   = aa + ai[row+1];
840 
841       for(j=0; j<nz-1; j+=2){
842         i0   = vi[0];
843         i1   = vi[1];
844         vi  +=2;
845         tmp0 = tmps[i0];
846         tmp1 = tmps[i1];
847         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
848         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
849       }
850       if(j == nz-1){
851         tmp0 = tmps[*vi++];
852         sum1 -= *v1++ *tmp0;
853         sum2 -= *v2++ *tmp0;
854       }
855       sum2 -= *v2++ * sum1;
856       tmp[row ++]=sum1;
857       tmp[row ++]=sum2;
858       break;
859     case 3:
860       sum1 = b[*r++];
861       sum2 = b[*r++];
862       sum3 = b[*r++];
863       v2   = aa + ai[row+1];
864       v3   = aa + ai[row+2];
865 
866       for (j=0; j<nz-1; j+=2){
867         i0   = vi[0];
868         i1   = vi[1];
869         vi  +=2;
870         tmp0 = tmps[i0];
871         tmp1 = tmps[i1];
872         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
873         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
874         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
875       }
876       if (j == nz-1){
877         tmp0 = tmps[*vi++];
878         sum1 -= *v1++ *tmp0;
879         sum2 -= *v2++ *tmp0;
880         sum3 -= *v3++ *tmp0;
881       }
882       sum2 -= *v2++ * sum1;
883       sum3 -= *v3++ * sum1;
884       sum3 -= *v3++ * sum2;
885       tmp[row ++]=sum1;
886       tmp[row ++]=sum2;
887       tmp[row ++]=sum3;
888       break;
889 
890     case 4:
891       sum1 = b[*r++];
892       sum2 = b[*r++];
893       sum3 = b[*r++];
894       sum4 = b[*r++];
895       v2   = aa + ai[row+1];
896       v3   = aa + ai[row+2];
897       v4   = aa + ai[row+3];
898 
899       for (j=0; j<nz-1; j+=2){
900         i0   = vi[0];
901         i1   = vi[1];
902         vi  +=2;
903         tmp0 = tmps[i0];
904         tmp1 = tmps[i1];
905         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
906         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
907         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
908         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
909       }
910       if (j == nz-1){
911         tmp0 = tmps[*vi++];
912         sum1 -= *v1++ *tmp0;
913         sum2 -= *v2++ *tmp0;
914         sum3 -= *v3++ *tmp0;
915         sum4 -= *v4++ *tmp0;
916       }
917       sum2 -= *v2++ * sum1;
918       sum3 -= *v3++ * sum1;
919       sum4 -= *v4++ * sum1;
920       sum3 -= *v3++ * sum2;
921       sum4 -= *v4++ * sum2;
922       sum4 -= *v4++ * sum3;
923 
924       tmp[row ++]=sum1;
925       tmp[row ++]=sum2;
926       tmp[row ++]=sum3;
927       tmp[row ++]=sum4;
928       break;
929     case 5:
930       sum1 = b[*r++];
931       sum2 = b[*r++];
932       sum3 = b[*r++];
933       sum4 = b[*r++];
934       sum5 = b[*r++];
935       v2   = aa + ai[row+1];
936       v3   = aa + ai[row+2];
937       v4   = aa + ai[row+3];
938       v5   = aa + ai[row+4];
939 
940       for (j=0; j<nz-1; j+=2){
941         i0   = vi[0];
942         i1   = vi[1];
943         vi  +=2;
944         tmp0 = tmps[i0];
945         tmp1 = tmps[i1];
946         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
947         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
948         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
949         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
950         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
951       }
952       if (j == nz-1){
953         tmp0 = tmps[*vi++];
954         sum1 -= *v1++ *tmp0;
955         sum2 -= *v2++ *tmp0;
956         sum3 -= *v3++ *tmp0;
957         sum4 -= *v4++ *tmp0;
958         sum5 -= *v5++ *tmp0;
959       }
960 
961       sum2 -= *v2++ * sum1;
962       sum3 -= *v3++ * sum1;
963       sum4 -= *v4++ * sum1;
964       sum5 -= *v5++ * sum1;
965       sum3 -= *v3++ * sum2;
966       sum4 -= *v4++ * sum2;
967       sum5 -= *v5++ * sum2;
968       sum4 -= *v4++ * sum3;
969       sum5 -= *v5++ * sum3;
970       sum5 -= *v5++ * sum4;
971 
972       tmp[row ++]=sum1;
973       tmp[row ++]=sum2;
974       tmp[row ++]=sum3;
975       tmp[row ++]=sum4;
976       tmp[row ++]=sum5;
977       break;
978     default:
979       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
980     }
981   }
982   /* backward solve the upper triangular */
983   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
984     nsz = ns[i];
985     aii = ai[row+1] -1;
986     v1  = aa + aii;
987     vi  = aj + aii;
988     nz  = aii- ad[row];
989     switch (nsz){               /* Each loop in 'case' is unrolled */
990     case 1 :
991       sum1 = tmp[row];
992 
993       for(j=nz ; j>1; j-=2){
994         vi  -=2;
995         i0   = vi[2];
996         i1   = vi[1];
997         tmp0 = tmps[i0];
998         tmp1 = tmps[i1];
999         v1   -= 2;
1000         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1001       }
1002       if (j==1){
1003         tmp0  = tmps[*vi--];
1004         sum1 -= *v1-- * tmp0;
1005       }
1006       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1007       break;
1008     case 2 :
1009       sum1 = tmp[row];
1010       sum2 = tmp[row -1];
1011       v2   = aa + ai[row]-1;
1012       for (j=nz ; j>1; j-=2){
1013         vi  -=2;
1014         i0   = vi[2];
1015         i1   = vi[1];
1016         tmp0 = tmps[i0];
1017         tmp1 = tmps[i1];
1018         v1   -= 2;
1019         v2   -= 2;
1020         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1021         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1022       }
1023       if (j==1){
1024         tmp0  = tmps[*vi--];
1025         sum1 -= *v1-- * tmp0;
1026         sum2 -= *v2-- * tmp0;
1027       }
1028 
1029       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1030       sum2   -= *v2-- * tmp0;
1031       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1032       break;
1033     case 3 :
1034       sum1 = tmp[row];
1035       sum2 = tmp[row -1];
1036       sum3 = tmp[row -2];
1037       v2   = aa + ai[row]-1;
1038       v3   = aa + ai[row -1]-1;
1039       for (j=nz ; j>1; j-=2){
1040         vi  -=2;
1041         i0   = vi[2];
1042         i1   = vi[1];
1043         tmp0 = tmps[i0];
1044         tmp1 = tmps[i1];
1045         v1   -= 2;
1046         v2   -= 2;
1047         v3   -= 2;
1048         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1049         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1050         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1051       }
1052       if (j==1){
1053         tmp0  = tmps[*vi--];
1054         sum1 -= *v1-- * tmp0;
1055         sum2 -= *v2-- * tmp0;
1056         sum3 -= *v3-- * tmp0;
1057       }
1058       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1059       sum2   -= *v2-- * tmp0;
1060       sum3   -= *v3-- * tmp0;
1061       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1062       sum3   -= *v3-- * tmp0;
1063       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1064 
1065       break;
1066     case 4 :
1067       sum1 = tmp[row];
1068       sum2 = tmp[row -1];
1069       sum3 = tmp[row -2];
1070       sum4 = tmp[row -3];
1071       v2   = aa + ai[row]-1;
1072       v3   = aa + ai[row -1]-1;
1073       v4   = aa + ai[row -2]-1;
1074 
1075       for (j=nz ; j>1; j-=2){
1076         vi  -=2;
1077         i0   = vi[2];
1078         i1   = vi[1];
1079         tmp0 = tmps[i0];
1080         tmp1 = tmps[i1];
1081         v1  -= 2;
1082         v2  -= 2;
1083         v3  -= 2;
1084         v4  -= 2;
1085         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1086         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1087         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1088         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1089       }
1090       if (j==1){
1091         tmp0  = tmps[*vi--];
1092         sum1 -= *v1-- * tmp0;
1093         sum2 -= *v2-- * tmp0;
1094         sum3 -= *v3-- * tmp0;
1095         sum4 -= *v4-- * tmp0;
1096       }
1097 
1098       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1099       sum2   -= *v2-- * tmp0;
1100       sum3   -= *v3-- * tmp0;
1101       sum4   -= *v4-- * tmp0;
1102       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1103       sum3   -= *v3-- * tmp0;
1104       sum4   -= *v4-- * tmp0;
1105       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1106       sum4   -= *v4-- * tmp0;
1107       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1108       break;
1109     case 5 :
1110       sum1 = tmp[row];
1111       sum2 = tmp[row -1];
1112       sum3 = tmp[row -2];
1113       sum4 = tmp[row -3];
1114       sum5 = tmp[row -4];
1115       v2   = aa + ai[row]-1;
1116       v3   = aa + ai[row -1]-1;
1117       v4   = aa + ai[row -2]-1;
1118       v5   = aa + ai[row -3]-1;
1119       for (j=nz ; j>1; j-=2){
1120         vi  -= 2;
1121         i0   = vi[2];
1122         i1   = vi[1];
1123         tmp0 = tmps[i0];
1124         tmp1 = tmps[i1];
1125         v1   -= 2;
1126         v2   -= 2;
1127         v3   -= 2;
1128         v4   -= 2;
1129         v5   -= 2;
1130         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1131         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1132         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1133         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1134         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1135       }
1136       if (j==1){
1137         tmp0  = tmps[*vi--];
1138         sum1 -= *v1-- * tmp0;
1139         sum2 -= *v2-- * tmp0;
1140         sum3 -= *v3-- * tmp0;
1141         sum4 -= *v4-- * tmp0;
1142         sum5 -= *v5-- * tmp0;
1143       }
1144 
1145       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1146       sum2   -= *v2-- * tmp0;
1147       sum3   -= *v3-- * tmp0;
1148       sum4   -= *v4-- * tmp0;
1149       sum5   -= *v5-- * tmp0;
1150       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1151       sum3   -= *v3-- * tmp0;
1152       sum4   -= *v4-- * tmp0;
1153       sum5   -= *v5-- * tmp0;
1154       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1155       sum4   -= *v4-- * tmp0;
1156       sum5   -= *v5-- * tmp0;
1157       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1158       sum5   -= *v5-- * tmp0;
1159       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1160       break;
1161     default:
1162       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1163     }
1164   }
1165   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1166   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1167   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1168   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1169   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1170   PetscFunctionReturn(0);
1171 }
1172 
1173 /* ----------------------------------------------------------- */
1174 #undef __FUNCT__
1175 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_newdatastruct"
1176 PetscErrorCode MatSolve_SeqAIJ_Inode_newdatastruct(Mat A,Vec bb,Vec xx)
1177 {
1178   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
1179   IS                iscol = a->col,isrow = a->row;
1180   PetscErrorCode    ierr;
1181   const PetscInt    *r,*c,*rout,*cout;
1182   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
1183   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
1184   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
1185   PetscScalar       sum1,sum2,sum3,sum4,sum5;
1186   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
1187   const PetscScalar *b;
1188 
1189   PetscFunctionBegin;
1190   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
1191   node_max = a->inode.node_count;
1192   ns       = a->inode.size;     /* Node Size array */
1193 
1194   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1195   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1196   tmp  = a->solve_work;
1197 
1198   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1199   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1200 
1201   /* forward solve the lower triangular */
1202   tmps = tmp ;
1203   aa   = a_a ;
1204   aj   = a_j ;
1205   ad   = a->diag;
1206 
1207   for (i = 0,row = 0; i< node_max; ++i){
1208     nsz = ns[i];
1209     aii = ai[row];
1210     v1  = aa + aii;
1211     vi  = aj + aii;
1212     nz  = ai[row+1]- ai[row];
1213 
1214     switch (nsz){               /* Each loop in 'case' is unrolled */
1215     case 1 :
1216       sum1 = b[r[row]];
1217       for(j=0; j<nz-1; j+=2){
1218         i0   = vi[j];
1219         i1   = vi[j+1];
1220         tmp0 = tmps[i0];
1221         tmp1 = tmps[i1];
1222         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
1223       }
1224       if(j == nz-1){
1225         tmp0 = tmps[vi[j]];
1226         sum1 -= v1[j]*tmp0;
1227       }
1228       tmp[row++]=sum1;
1229       break;
1230     case 2:
1231       sum1 = b[r[row]];
1232       sum2 = b[r[row+1]];
1233       v2   = aa + ai[row+1];
1234 
1235       for(j=0; j<nz-1; j+=2){
1236         i0   = vi[j];
1237         i1   = vi[j+1];
1238         tmp0 = tmps[i0];
1239         tmp1 = tmps[i1];
1240         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1241         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1242       }
1243       if(j == nz-1){
1244         tmp0 = tmps[vi[j]];
1245         sum1 -= v1[j] *tmp0;
1246         sum2 -= v2[j] *tmp0;
1247       }
1248       sum2 -= v2[nz] * sum1;
1249       tmp[row ++]=sum1;
1250       tmp[row ++]=sum2;
1251       break;
1252     case 3:
1253       sum1 = b[r[row]];
1254       sum2 = b[r[row+1]];
1255       sum3 = b[r[row+2]];
1256       v2   = aa + ai[row+1];
1257       v3   = aa + ai[row+2];
1258 
1259       for (j=0; j<nz-1; j+=2){
1260         i0   = vi[j];
1261         i1   = vi[j+1];
1262         tmp0 = tmps[i0];
1263         tmp1 = tmps[i1];
1264         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1265         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1266         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1267       }
1268       if (j == nz-1){
1269         tmp0 = tmps[vi[j]];
1270         sum1 -= v1[j] *tmp0;
1271         sum2 -= v2[j] *tmp0;
1272         sum3 -= v3[j] *tmp0;
1273       }
1274       sum2 -= v2[nz] * sum1;
1275       sum3 -= v3[nz] * sum1;
1276       sum3 -= v3[nz+1] * sum2;
1277       tmp[row ++]=sum1;
1278       tmp[row ++]=sum2;
1279       tmp[row ++]=sum3;
1280       break;
1281 
1282     case 4:
1283       sum1 = b[r[row]];
1284       sum2 = b[r[row+1]];
1285       sum3 = b[r[row+2]];
1286       sum4 = b[r[row+3]];
1287       v2   = aa + ai[row+1];
1288       v3   = aa + ai[row+2];
1289       v4   = aa + ai[row+3];
1290 
1291       for (j=0; j<nz-1; j+=2){
1292         i0   = vi[j];
1293         i1   = vi[j+1];
1294         tmp0 = tmps[i0];
1295         tmp1 = tmps[i1];
1296         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1297         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1298         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1299         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
1300       }
1301       if (j == nz-1){
1302         tmp0 = tmps[vi[j]];
1303         sum1 -= v1[j] *tmp0;
1304         sum2 -= v2[j] *tmp0;
1305         sum3 -= v3[j] *tmp0;
1306         sum4 -= v4[j] *tmp0;
1307       }
1308       sum2 -= v2[nz] * sum1;
1309       sum3 -= v3[nz] * sum1;
1310       sum4 -= v4[nz] * sum1;
1311       sum3 -= v3[nz+1] * sum2;
1312       sum4 -= v4[nz+1] * sum2;
1313       sum4 -= v4[nz+2] * sum3;
1314 
1315       tmp[row ++]=sum1;
1316       tmp[row ++]=sum2;
1317       tmp[row ++]=sum3;
1318       tmp[row ++]=sum4;
1319       break;
1320     case 5:
1321       sum1 = b[r[row]];
1322       sum2 = b[r[row+1]];
1323       sum3 = b[r[row+2]];
1324       sum4 = b[r[row+3]];
1325       sum5 = b[r[row+4]];
1326       v2   = aa + ai[row+1];
1327       v3   = aa + ai[row+2];
1328       v4   = aa + ai[row+3];
1329       v5   = aa + ai[row+4];
1330 
1331       for (j=0; j<nz-1; j+=2){
1332         i0   = vi[j];
1333         i1   = vi[j+1];
1334         tmp0 = tmps[i0];
1335         tmp1 = tmps[i1];
1336         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1337         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1338         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1339         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
1340         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
1341       }
1342       if (j == nz-1){
1343         tmp0 = tmps[vi[j]];
1344         sum1 -= v1[j] *tmp0;
1345         sum2 -= v2[j] *tmp0;
1346         sum3 -= v3[j] *tmp0;
1347         sum4 -= v4[j] *tmp0;
1348         sum5 -= v5[j] *tmp0;
1349       }
1350 
1351       sum2 -= v2[nz] * sum1;
1352       sum3 -= v3[nz] * sum1;
1353       sum4 -= v4[nz] * sum1;
1354       sum5 -= v5[nz] * sum1;
1355       sum3 -= v3[nz+1] * sum2;
1356       sum4 -= v4[nz+1] * sum2;
1357       sum5 -= v5[nz+1] * sum2;
1358       sum4 -= v4[nz+2] * sum3;
1359       sum5 -= v5[nz+2] * sum3;
1360       sum5 -= v5[nz+3] * sum4;
1361 
1362       tmp[row ++]=sum1;
1363       tmp[row ++]=sum2;
1364       tmp[row ++]=sum3;
1365       tmp[row ++]=sum4;
1366       tmp[row ++]=sum5;
1367       break;
1368     default:
1369       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1370     }
1371   }
1372   /* backward solve the upper triangular */
1373   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1374     nsz = ns[i];
1375     aii = ad[row+1] + 1;
1376     v1  = aa + aii;
1377     vi  = aj + aii;
1378     nz  = ad[row]- ad[row+1] - 1;
1379     switch (nsz){               /* Each loop in 'case' is unrolled */
1380     case 1 :
1381       sum1 = tmp[row];
1382 
1383       for(j=0 ; j<nz-1; j+=2){
1384         i0   = vi[j];
1385         i1   = vi[j+1];
1386         tmp0 = tmps[i0];
1387         tmp1 = tmps[i1];
1388         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1389       }
1390       if (j == nz-1){
1391         tmp0  = tmps[vi[j]];
1392         sum1 -= v1[j]*tmp0;
1393       }
1394       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1395       break;
1396     case 2 :
1397       sum1 = tmp[row];
1398       sum2 = tmp[row-1];
1399       v2   = aa + ad[row] + 1;
1400       for (j=0 ; j<nz-1; j+=2){
1401         i0   = vi[j];
1402         i1   = vi[j+1];
1403         tmp0 = tmps[i0];
1404         tmp1 = tmps[i1];
1405         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1406         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1407       }
1408       if (j == nz-1){
1409         tmp0  = tmps[vi[j]];
1410         sum1 -= v1[j]* tmp0;
1411         sum2 -= v2[j+1]* tmp0;
1412       }
1413 
1414       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1415       sum2   -= v2[0] * tmp0;
1416       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1417       break;
1418     case 3 :
1419       sum1 = tmp[row];
1420       sum2 = tmp[row -1];
1421       sum3 = tmp[row -2];
1422       v2   = aa + ad[row] + 1;
1423       v3   = aa + ad[row -1] + 1;
1424       for (j=0 ; j<nz-1; j+=2){
1425         i0   = vi[j];
1426         i1   = vi[j+1];
1427         tmp0 = tmps[i0];
1428         tmp1 = tmps[i1];
1429         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1430         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1431         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1432       }
1433       if (j== nz-1){
1434         tmp0  = tmps[vi[j]];
1435         sum1 -= v1[j] * tmp0;
1436         sum2 -= v2[j+1] * tmp0;
1437         sum3 -= v3[j+2] * tmp0;
1438       }
1439       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1440       sum2   -= v2[0]* tmp0;
1441       sum3   -= v3[1] * tmp0;
1442       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1443       sum3   -= v3[0]* tmp0;
1444       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1445 
1446       break;
1447     case 4 :
1448       sum1 = tmp[row];
1449       sum2 = tmp[row -1];
1450       sum3 = tmp[row -2];
1451       sum4 = tmp[row -3];
1452       v2   = aa + ad[row]+1;
1453       v3   = aa + ad[row -1]+1;
1454       v4   = aa + ad[row -2]+1;
1455 
1456       for (j=0 ; j<nz-1; j+=2){
1457         i0   = vi[j];
1458         i1   = vi[j+1];
1459         tmp0 = tmps[i0];
1460         tmp1 = tmps[i1];
1461         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
1462         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1463         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1464         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
1465       }
1466       if (j== nz-1){
1467         tmp0  = tmps[vi[j]];
1468         sum1 -= v1[j] * tmp0;
1469         sum2 -= v2[j+1] * tmp0;
1470         sum3 -= v3[j+2] * tmp0;
1471         sum4 -= v4[j+3] * tmp0;
1472       }
1473 
1474       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1475       sum2   -= v2[0] * tmp0;
1476       sum3   -= v3[1] * tmp0;
1477       sum4   -= v4[2] * tmp0;
1478       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1479       sum3   -= v3[0] * tmp0;
1480       sum4   -= v4[1] * tmp0;
1481       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1482       sum4   -= v4[0] * tmp0;
1483       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
1484       break;
1485     case 5 :
1486       sum1 = tmp[row];
1487       sum2 = tmp[row -1];
1488       sum3 = tmp[row -2];
1489       sum4 = tmp[row -3];
1490       sum5 = tmp[row -4];
1491       v2   = aa + ad[row]+1;
1492       v3   = aa + ad[row -1]+1;
1493       v4   = aa + ad[row -2]+1;
1494       v5   = aa + ad[row -3]+1;
1495       for (j=0 ; j<nz-1; j+=2){
1496         i0   = vi[j];
1497         i1   = vi[j+1];
1498         tmp0 = tmps[i0];
1499         tmp1 = tmps[i1];
1500         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1501         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1502         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1503         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
1504         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
1505       }
1506       if (j==nz-1){
1507         tmp0  = tmps[vi[j]];
1508         sum1 -= v1[j] * tmp0;
1509         sum2 -= v2[j+1] * tmp0;
1510         sum3 -= v3[j+2] * tmp0;
1511         sum4 -= v4[j+3] * tmp0;
1512         sum5 -= v5[j+4] * tmp0;
1513       }
1514 
1515       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1516       sum2   -= v2[0] * tmp0;
1517       sum3   -= v3[1] * tmp0;
1518       sum4   -= v4[2] * tmp0;
1519       sum5   -= v5[3] * tmp0;
1520       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1521       sum3   -= v3[0] * tmp0;
1522       sum4   -= v4[1] * tmp0;
1523       sum5   -= v5[2] * tmp0;
1524       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1525       sum4   -= v4[0] * tmp0;
1526       sum5   -= v5[1] * tmp0;
1527       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
1528       sum5   -= v5[0] * tmp0;
1529       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
1530       break;
1531     default:
1532       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1533     }
1534   }
1535   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1536   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1537   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1538   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1539   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1540   PetscFunctionReturn(0);
1541 }
1542 
1543 
1544 /*
1545      Makes a longer coloring[] array and calls the usual code with that
1546 */
1547 #undef __FUNCT__
1548 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
1549 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1550 {
1551   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1552   PetscErrorCode  ierr;
1553   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1554   PetscInt        *colorused,i;
1555   ISColoringValue *newcolor;
1556 
1557   PetscFunctionBegin;
1558   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1559   /* loop over inodes, marking a color for each column*/
1560   row = 0;
1561   for (i=0; i<m; i++){
1562     for (j=0; j<ns[i]; j++) {
1563       newcolor[row++] = coloring[i] + j*ncolors;
1564     }
1565   }
1566 
1567   /* eliminate unneeded colors */
1568   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1569   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1570   for (i=0; i<n; i++) {
1571     colorused[newcolor[i]] = 1;
1572   }
1573 
1574   for (i=1; i<5*ncolors; i++) {
1575     colorused[i] += colorused[i-1];
1576   }
1577   ncolors = colorused[5*ncolors-1];
1578   for (i=0; i<n; i++) {
1579     newcolor[i] = colorused[newcolor[i]]-1;
1580   }
1581   ierr = PetscFree(colorused);CHKERRQ(ierr);
1582   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1583   ierr = PetscFree(coloring);CHKERRQ(ierr);
1584   PetscFunctionReturn(0);
1585 }
1586 
1587 #include "../src/mat/blockinvert.h"
1588 
1589 #undef __FUNCT__
1590 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
1591 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1592 {
1593   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1594   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1595   MatScalar          *ibdiag,*bdiag;
1596   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1597   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
1598   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
1599   PetscErrorCode     ierr;
1600   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1601   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k;
1602 
1603   PetscFunctionBegin;
1604   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1605   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1606   if (its > 1) {
1607     /* switch to non-inode version */
1608     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
1609     PetscFunctionReturn(0);
1610   }
1611 
1612   if (!a->inode.ibdiagvalid) {
1613     if (!a->inode.ibdiag) {
1614       /* calculate space needed for diagonal blocks */
1615       for (i=0; i<m; i++) {
1616 	cnt += sizes[i]*sizes[i];
1617       }
1618       a->inode.bdiagsize = cnt;
1619       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
1620     }
1621 
1622     /* copy over the diagonal blocks and invert them */
1623     ibdiag = a->inode.ibdiag;
1624     bdiag  = a->inode.bdiag;
1625     cnt = 0;
1626     for (i=0, row = 0; i<m; i++) {
1627       for (j=0; j<sizes[i]; j++) {
1628         for (k=0; k<sizes[i]; k++) {
1629           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1630         }
1631       }
1632       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
1633 
1634       switch(sizes[i]) {
1635         case 1:
1636           /* Create matrix data structure */
1637           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1638           ibdiag[cnt] = 1.0/ibdiag[cnt];
1639           break;
1640         case 2:
1641           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
1642           break;
1643         case 3:
1644           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
1645           break;
1646         case 4:
1647           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
1648           break;
1649         case 5:
1650           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr);
1651           break;
1652        default:
1653 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1654       }
1655       cnt += sizes[i]*sizes[i];
1656       row += sizes[i];
1657     }
1658     a->inode.ibdiagvalid = PETSC_TRUE;
1659   }
1660   ibdiag = a->inode.ibdiag;
1661   bdiag  = a->inode.bdiag;
1662 
1663 
1664   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1665   if (flag & SOR_ZERO_INITIAL_GUESS) {
1666     PetscScalar *b;
1667     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1668     if (xx != bb) {
1669       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1670     } else {
1671       b = x;
1672     }
1673     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1674 
1675       for (i=0, row=0; i<m; i++) {
1676         sz  = diag[row] - ii[row];
1677         v1  = a->a + ii[row];
1678         idx = a->j + ii[row];
1679 
1680         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
1681         switch (sizes[i]){
1682           case 1:
1683 
1684             sum1  = b[row];
1685             for(n = 0; n<sz-1; n+=2) {
1686               i1   = idx[0];
1687               i2   = idx[1];
1688               idx += 2;
1689               tmp0 = x[i1];
1690               tmp1 = x[i2];
1691               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1692             }
1693 
1694             if (n == sz-1){
1695               tmp0  = x[*idx];
1696               sum1 -= *v1 * tmp0;
1697             }
1698             x[row++] = sum1*(*ibdiag++);
1699             break;
1700           case 2:
1701             v2    = a->a + ii[row+1];
1702             sum1  = b[row];
1703             sum2  = b[row+1];
1704             for(n = 0; n<sz-1; n+=2) {
1705               i1   = idx[0];
1706               i2   = idx[1];
1707               idx += 2;
1708               tmp0 = x[i1];
1709               tmp1 = x[i2];
1710               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1711               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1712             }
1713 
1714             if (n == sz-1){
1715               tmp0  = x[*idx];
1716               sum1 -= v1[0] * tmp0;
1717               sum2 -= v2[0] * tmp0;
1718             }
1719             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1720             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1721             ibdiag  += 4;
1722             break;
1723           case 3:
1724             v2    = a->a + ii[row+1];
1725             v3    = a->a + ii[row+2];
1726             sum1  = b[row];
1727             sum2  = b[row+1];
1728             sum3  = b[row+2];
1729             for(n = 0; n<sz-1; n+=2) {
1730               i1   = idx[0];
1731               i2   = idx[1];
1732               idx += 2;
1733               tmp0 = x[i1];
1734               tmp1 = x[i2];
1735               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1736               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1737               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1738             }
1739 
1740             if (n == sz-1){
1741               tmp0  = x[*idx];
1742               sum1 -= v1[0] * tmp0;
1743               sum2 -= v2[0] * tmp0;
1744               sum3 -= v3[0] * tmp0;
1745             }
1746             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1747             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1748             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1749             ibdiag  += 9;
1750             break;
1751           case 4:
1752             v2    = a->a + ii[row+1];
1753             v3    = a->a + ii[row+2];
1754             v4    = a->a + ii[row+3];
1755             sum1  = b[row];
1756             sum2  = b[row+1];
1757             sum3  = b[row+2];
1758             sum4  = b[row+3];
1759             for(n = 0; n<sz-1; n+=2) {
1760               i1   = idx[0];
1761               i2   = idx[1];
1762               idx += 2;
1763               tmp0 = x[i1];
1764               tmp1 = x[i2];
1765               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1766               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1767               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1768               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1769             }
1770 
1771             if (n == sz-1){
1772               tmp0  = x[*idx];
1773               sum1 -= v1[0] * tmp0;
1774               sum2 -= v2[0] * tmp0;
1775               sum3 -= v3[0] * tmp0;
1776               sum4 -= v4[0] * tmp0;
1777             }
1778             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1779             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1780             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1781             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1782             ibdiag  += 16;
1783             break;
1784           case 5:
1785             v2    = a->a + ii[row+1];
1786             v3    = a->a + ii[row+2];
1787             v4    = a->a + ii[row+3];
1788             v5    = a->a + ii[row+4];
1789             sum1  = b[row];
1790             sum2  = b[row+1];
1791             sum3  = b[row+2];
1792             sum4  = b[row+3];
1793             sum5  = b[row+4];
1794             for(n = 0; n<sz-1; n+=2) {
1795               i1   = idx[0];
1796               i2   = idx[1];
1797               idx += 2;
1798               tmp0 = x[i1];
1799               tmp1 = x[i2];
1800               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1801               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1802               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1803               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1804               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1805             }
1806 
1807             if (n == sz-1){
1808               tmp0  = x[*idx];
1809               sum1 -= v1[0] * tmp0;
1810               sum2 -= v2[0] * tmp0;
1811               sum3 -= v3[0] * tmp0;
1812               sum4 -= v4[0] * tmp0;
1813               sum5 -= v5[0] * tmp0;
1814             }
1815             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1816             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1817             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1818             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1819             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1820             ibdiag  += 25;
1821             break;
1822 	  default:
1823    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1824         }
1825       }
1826 
1827       xb = x;
1828       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1829     } else xb = b;
1830     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1831         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1832       cnt = 0;
1833       for (i=0, row=0; i<m; i++) {
1834 
1835         switch (sizes[i]){
1836           case 1:
1837             x[row++] *= bdiag[cnt++];
1838             break;
1839           case 2:
1840             x1   = x[row]; x2 = x[row+1];
1841             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1842             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1843             x[row++] = tmp1;
1844             x[row++] = tmp2;
1845             cnt += 4;
1846             break;
1847           case 3:
1848             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1849             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1850             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1851             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1852             x[row++] = tmp1;
1853             x[row++] = tmp2;
1854             x[row++] = tmp3;
1855             cnt += 9;
1856             break;
1857           case 4:
1858             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1859             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1860             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1861             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1862             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1863             x[row++] = tmp1;
1864             x[row++] = tmp2;
1865             x[row++] = tmp3;
1866             x[row++] = tmp4;
1867             cnt += 16;
1868             break;
1869           case 5:
1870             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1871             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1872             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1873             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1874             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1875             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1876             x[row++] = tmp1;
1877             x[row++] = tmp2;
1878             x[row++] = tmp3;
1879             x[row++] = tmp4;
1880             x[row++] = tmp5;
1881             cnt += 25;
1882             break;
1883 	  default:
1884    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1885         }
1886       }
1887       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1888     }
1889     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1890 
1891       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1892       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
1893         ibdiag -= sizes[i]*sizes[i];
1894         sz      = ii[row+1] - diag[row] - 1;
1895         v1      = a->a + diag[row] + 1;
1896         idx     = a->j + diag[row] + 1;
1897 
1898         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
1899         switch (sizes[i]){
1900           case 1:
1901 
1902             sum1  = xb[row];
1903             for(n = 0; n<sz-1; n+=2) {
1904               i1   = idx[0];
1905               i2   = idx[1];
1906               idx += 2;
1907               tmp0 = x[i1];
1908               tmp1 = x[i2];
1909               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1910             }
1911 
1912             if (n == sz-1){
1913               tmp0  = x[*idx];
1914               sum1 -= *v1*tmp0;
1915             }
1916             x[row--] = sum1*(*ibdiag);
1917             break;
1918 
1919           case 2:
1920 
1921             sum1  = xb[row];
1922             sum2  = xb[row-1];
1923             /* note that sum1 is associated with the second of the two rows */
1924             v2    = a->a + diag[row-1] + 2;
1925             for(n = 0; n<sz-1; n+=2) {
1926               i1   = idx[0];
1927               i2   = idx[1];
1928               idx += 2;
1929               tmp0 = x[i1];
1930               tmp1 = x[i2];
1931               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1932               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1933             }
1934 
1935             if (n == sz-1){
1936               tmp0  = x[*idx];
1937               sum1 -= *v1*tmp0;
1938               sum2 -= *v2*tmp0;
1939             }
1940             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1941             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1942             break;
1943           case 3:
1944 
1945             sum1  = xb[row];
1946             sum2  = xb[row-1];
1947             sum3  = xb[row-2];
1948             v2    = a->a + diag[row-1] + 2;
1949             v3    = a->a + diag[row-2] + 3;
1950             for(n = 0; n<sz-1; n+=2) {
1951               i1   = idx[0];
1952               i2   = idx[1];
1953               idx += 2;
1954               tmp0 = x[i1];
1955               tmp1 = x[i2];
1956               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1957               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1958               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1959             }
1960 
1961             if (n == sz-1){
1962               tmp0  = x[*idx];
1963               sum1 -= *v1*tmp0;
1964               sum2 -= *v2*tmp0;
1965               sum3 -= *v3*tmp0;
1966             }
1967             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
1968             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
1969             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
1970             break;
1971           case 4:
1972 
1973             sum1  = xb[row];
1974             sum2  = xb[row-1];
1975             sum3  = xb[row-2];
1976             sum4  = xb[row-3];
1977             v2    = a->a + diag[row-1] + 2;
1978             v3    = a->a + diag[row-2] + 3;
1979             v4    = a->a + diag[row-3] + 4;
1980             for(n = 0; n<sz-1; n+=2) {
1981               i1   = idx[0];
1982               i2   = idx[1];
1983               idx += 2;
1984               tmp0 = x[i1];
1985               tmp1 = x[i2];
1986               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1987               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1988               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1989               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1990             }
1991 
1992             if (n == sz-1){
1993               tmp0  = x[*idx];
1994               sum1 -= *v1*tmp0;
1995               sum2 -= *v2*tmp0;
1996               sum3 -= *v3*tmp0;
1997               sum4 -= *v4*tmp0;
1998             }
1999             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2000             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2001             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2002             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2003             break;
2004           case 5:
2005 
2006             sum1  = xb[row];
2007             sum2  = xb[row-1];
2008             sum3  = xb[row-2];
2009             sum4  = xb[row-3];
2010             sum5  = xb[row-4];
2011             v2    = a->a + diag[row-1] + 2;
2012             v3    = a->a + diag[row-2] + 3;
2013             v4    = a->a + diag[row-3] + 4;
2014             v5    = a->a + diag[row-4] + 5;
2015             for(n = 0; n<sz-1; n+=2) {
2016               i1   = idx[0];
2017               i2   = idx[1];
2018               idx += 2;
2019               tmp0 = x[i1];
2020               tmp1 = x[i2];
2021               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2022               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2023               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2024               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2025               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2026             }
2027 
2028             if (n == sz-1){
2029               tmp0  = x[*idx];
2030               sum1 -= *v1*tmp0;
2031               sum2 -= *v2*tmp0;
2032               sum3 -= *v3*tmp0;
2033               sum4 -= *v4*tmp0;
2034               sum5 -= *v5*tmp0;
2035             }
2036             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2037             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2038             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2039             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2040             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2041             break;
2042 	  default:
2043    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2044         }
2045       }
2046 
2047       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2048     }
2049     its--;
2050     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2051     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2052   }
2053   if (flag & SOR_EISENSTAT) {
2054     const PetscScalar *b;
2055     MatScalar         *t = a->inode.ssor_work;
2056 
2057     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2058     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2059     /*
2060           Apply  (U + D)^-1  where D is now the block diagonal
2061     */
2062     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2063     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2064       ibdiag -= sizes[i]*sizes[i];
2065       sz      = ii[row+1] - diag[row] - 1;
2066       v1      = a->a + diag[row] + 1;
2067       idx     = a->j + diag[row] + 1;
2068       CHKMEMQ;
2069       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2070       switch (sizes[i]){
2071         case 1:
2072 
2073 	  sum1  = b[row];
2074 	  for(n = 0; n<sz-1; n+=2) {
2075 	    i1   = idx[0];
2076 	    i2   = idx[1];
2077 	    idx += 2;
2078 	    tmp0 = x[i1];
2079 	    tmp1 = x[i2];
2080 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2081 	  }
2082 
2083 	  if (n == sz-1){
2084 	    tmp0  = x[*idx];
2085 	    sum1 -= *v1*tmp0;
2086 	  }
2087 	  x[row] = sum1*(*ibdiag);row--;
2088 	  break;
2089 
2090         case 2:
2091 
2092 	  sum1  = b[row];
2093 	  sum2  = b[row-1];
2094 	  /* note that sum1 is associated with the second of the two rows */
2095 	  v2    = a->a + diag[row-1] + 2;
2096 	  for(n = 0; n<sz-1; n+=2) {
2097 	    i1   = idx[0];
2098 	    i2   = idx[1];
2099 	    idx += 2;
2100 	    tmp0 = x[i1];
2101 	    tmp1 = x[i2];
2102 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2103 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2104 	  }
2105 
2106 	  if (n == sz-1){
2107 	    tmp0  = x[*idx];
2108 	    sum1 -= *v1*tmp0;
2109 	    sum2 -= *v2*tmp0;
2110 	  }
2111 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
2112 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
2113           row -= 2;
2114 	  break;
2115         case 3:
2116 
2117 	  sum1  = b[row];
2118 	  sum2  = b[row-1];
2119 	  sum3  = b[row-2];
2120 	  v2    = a->a + diag[row-1] + 2;
2121 	  v3    = a->a + diag[row-2] + 3;
2122 	  for(n = 0; n<sz-1; n+=2) {
2123 	    i1   = idx[0];
2124 	    i2   = idx[1];
2125 	    idx += 2;
2126 	    tmp0 = x[i1];
2127 	    tmp1 = x[i2];
2128 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2129 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2130 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2131 	  }
2132 
2133 	  if (n == sz-1){
2134 	    tmp0  = x[*idx];
2135 	    sum1 -= *v1*tmp0;
2136 	    sum2 -= *v2*tmp0;
2137 	    sum3 -= *v3*tmp0;
2138 	  }
2139 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2140 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2141 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2142           row -= 3;
2143 	  break;
2144         case 4:
2145 
2146 	  sum1  = b[row];
2147 	  sum2  = b[row-1];
2148 	  sum3  = b[row-2];
2149 	  sum4  = b[row-3];
2150 	  v2    = a->a + diag[row-1] + 2;
2151 	  v3    = a->a + diag[row-2] + 3;
2152 	  v4    = a->a + diag[row-3] + 4;
2153 	  for(n = 0; n<sz-1; n+=2) {
2154 	    i1   = idx[0];
2155 	    i2   = idx[1];
2156 	    idx += 2;
2157 	    tmp0 = x[i1];
2158 	    tmp1 = x[i2];
2159 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2160 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2161 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2162 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2163 	  }
2164 
2165 	  if (n == sz-1){
2166 	    tmp0  = x[*idx];
2167 	    sum1 -= *v1*tmp0;
2168 	    sum2 -= *v2*tmp0;
2169 	    sum3 -= *v3*tmp0;
2170 	    sum4 -= *v4*tmp0;
2171 	  }
2172 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2173 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2174 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2175 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2176           row -= 4;
2177 	  break;
2178         case 5:
2179 
2180 	  sum1  = b[row];
2181 	  sum2  = b[row-1];
2182 	  sum3  = b[row-2];
2183 	  sum4  = b[row-3];
2184 	  sum5  = b[row-4];
2185 	  v2    = a->a + diag[row-1] + 2;
2186 	  v3    = a->a + diag[row-2] + 3;
2187 	  v4    = a->a + diag[row-3] + 4;
2188 	  v5    = a->a + diag[row-4] + 5;
2189 	  for(n = 0; n<sz-1; n+=2) {
2190 	    i1   = idx[0];
2191 	    i2   = idx[1];
2192 	    idx += 2;
2193 	    tmp0 = x[i1];
2194 	    tmp1 = x[i2];
2195 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2196 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2197 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2198 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2199 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2200 	  }
2201 
2202 	  if (n == sz-1){
2203 	    tmp0  = x[*idx];
2204 	    sum1 -= *v1*tmp0;
2205 	    sum2 -= *v2*tmp0;
2206 	    sum3 -= *v3*tmp0;
2207 	    sum4 -= *v4*tmp0;
2208 	    sum5 -= *v5*tmp0;
2209 	  }
2210 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2211 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2212 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2213 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2214 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2215           row -= 5;
2216 	  break;
2217         default:
2218 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2219       }
2220       CHKMEMQ;
2221     }
2222     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2223 
2224     /*
2225            t = b - D x    where D is the block diagonal
2226     */
2227     cnt = 0;
2228     for (i=0, row=0; i<m; i++) {
2229       CHKMEMQ;
2230       switch (sizes[i]){
2231         case 1:
2232 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
2233 	  break;
2234         case 2:
2235 	  x1   = x[row]; x2 = x[row+1];
2236 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2237 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2238 	  t[row]   = b[row] - tmp1;
2239 	  t[row+1] = b[row+1] - tmp2; row += 2;
2240 	  cnt += 4;
2241 	  break;
2242         case 3:
2243 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2244 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2245 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2246 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2247 	  t[row] = b[row] - tmp1;
2248 	  t[row+1] = b[row+1] - tmp2;
2249 	  t[row+2] = b[row+2] - tmp3; row += 3;
2250 	  cnt += 9;
2251 	  break;
2252         case 4:
2253 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2254 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2255 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2256 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2257 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2258 	  t[row] = b[row] - tmp1;
2259 	  t[row+1] = b[row+1] - tmp2;
2260 	  t[row+2] = b[row+2] - tmp3;
2261 	  t[row+3] = b[row+3] - tmp4; row += 4;
2262 	  cnt += 16;
2263 	  break;
2264         case 5:
2265 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2266 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2267 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2268 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2269 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2270 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2271 	  t[row] = b[row] - tmp1;
2272 	  t[row+1] = b[row+1] - tmp2;
2273 	  t[row+2] = b[row+2] - tmp3;
2274 	  t[row+3] = b[row+3] - tmp4;
2275 	  t[row+4] = b[row+4] - tmp5;row += 5;
2276 	  cnt += 25;
2277 	  break;
2278         default:
2279 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2280       }
2281       CHKMEMQ;
2282     }
2283     ierr = PetscLogFlops(m);CHKERRQ(ierr);
2284 
2285 
2286 
2287     /*
2288           Apply (L + D)^-1 where D is the block diagonal
2289     */
2290     for (i=0, row=0; i<m; i++) {
2291       sz  = diag[row] - ii[row];
2292       v1  = a->a + ii[row];
2293       idx = a->j + ii[row];
2294       CHKMEMQ;
2295       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2296       switch (sizes[i]){
2297         case 1:
2298 
2299 	  sum1  = t[row];
2300 	  for(n = 0; n<sz-1; n+=2) {
2301 	    i1   = idx[0];
2302 	    i2   = idx[1];
2303 	    idx += 2;
2304 	    tmp0 = t[i1];
2305 	    tmp1 = t[i2];
2306 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2307 	  }
2308 
2309 	  if (n == sz-1){
2310 	    tmp0  = t[*idx];
2311 	    sum1 -= *v1 * tmp0;
2312 	  }
2313 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
2314 	  break;
2315         case 2:
2316 	  v2    = a->a + ii[row+1];
2317 	  sum1  = t[row];
2318 	  sum2  = t[row+1];
2319 	  for(n = 0; n<sz-1; n+=2) {
2320 	    i1   = idx[0];
2321 	    i2   = idx[1];
2322 	    idx += 2;
2323 	    tmp0 = t[i1];
2324 	    tmp1 = t[i2];
2325 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2326 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2327 	  }
2328 
2329 	  if (n == sz-1){
2330 	    tmp0  = t[*idx];
2331 	    sum1 -= v1[0] * tmp0;
2332 	    sum2 -= v2[0] * tmp0;
2333 	  }
2334 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
2335 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
2336 	  ibdiag  += 4; row += 2;
2337 	  break;
2338         case 3:
2339 	  v2    = a->a + ii[row+1];
2340 	  v3    = a->a + ii[row+2];
2341 	  sum1  = t[row];
2342 	  sum2  = t[row+1];
2343 	  sum3  = t[row+2];
2344 	  for(n = 0; n<sz-1; n+=2) {
2345 	    i1   = idx[0];
2346 	    i2   = idx[1];
2347 	    idx += 2;
2348 	    tmp0 = t[i1];
2349 	    tmp1 = t[i2];
2350 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2351 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2352 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2353 	  }
2354 
2355 	  if (n == sz-1){
2356 	    tmp0  = t[*idx];
2357 	    sum1 -= v1[0] * tmp0;
2358 	    sum2 -= v2[0] * tmp0;
2359 	    sum3 -= v3[0] * tmp0;
2360 	  }
2361 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2362 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2363 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2364 	  ibdiag  += 9; row += 3;
2365 	  break;
2366         case 4:
2367 	  v2    = a->a + ii[row+1];
2368 	  v3    = a->a + ii[row+2];
2369 	  v4    = a->a + ii[row+3];
2370 	  sum1  = t[row];
2371 	  sum2  = t[row+1];
2372 	  sum3  = t[row+2];
2373 	  sum4  = t[row+3];
2374 	  for(n = 0; n<sz-1; n+=2) {
2375 	    i1   = idx[0];
2376 	    i2   = idx[1];
2377 	    idx += 2;
2378 	    tmp0 = t[i1];
2379 	    tmp1 = t[i2];
2380 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2381 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2382 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2383 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2384 	  }
2385 
2386 	  if (n == sz-1){
2387 	    tmp0  = t[*idx];
2388 	    sum1 -= v1[0] * tmp0;
2389 	    sum2 -= v2[0] * tmp0;
2390 	    sum3 -= v3[0] * tmp0;
2391 	    sum4 -= v4[0] * tmp0;
2392 	  }
2393 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2394 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2395 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2396 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2397 	  ibdiag  += 16; row += 4;
2398 	  break;
2399         case 5:
2400 	  v2    = a->a + ii[row+1];
2401 	  v3    = a->a + ii[row+2];
2402 	  v4    = a->a + ii[row+3];
2403 	  v5    = a->a + ii[row+4];
2404 	  sum1  = t[row];
2405 	  sum2  = t[row+1];
2406 	  sum3  = t[row+2];
2407 	  sum4  = t[row+3];
2408 	  sum5  = t[row+4];
2409 	  for(n = 0; n<sz-1; n+=2) {
2410 	    i1   = idx[0];
2411 	    i2   = idx[1];
2412 	    idx += 2;
2413 	    tmp0 = t[i1];
2414 	    tmp1 = t[i2];
2415 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2416 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2417 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2418 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2419 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2420 	  }
2421 
2422 	  if (n == sz-1){
2423 	    tmp0  = t[*idx];
2424 	    sum1 -= v1[0] * tmp0;
2425 	    sum2 -= v2[0] * tmp0;
2426 	    sum3 -= v3[0] * tmp0;
2427 	    sum4 -= v4[0] * tmp0;
2428 	    sum5 -= v5[0] * tmp0;
2429 	  }
2430 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2431 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2432 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2433 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2434 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2435 	  ibdiag  += 25; row += 5;
2436 	  break;
2437         default:
2438 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2439       }
2440       CHKMEMQ;
2441     }
2442     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2443     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2444     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2445   }
2446   PetscFunctionReturn(0);
2447 }
2448 
2449 #undef __FUNCT__
2450 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
2451 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2452 {
2453   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2454   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
2455   const MatScalar    *bdiag = a->inode.bdiag;
2456   const PetscScalar  *b;
2457   PetscErrorCode      ierr;
2458   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
2459   const PetscInt      *sizes = a->inode.size;
2460 
2461   PetscFunctionBegin;
2462   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2463   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2464   cnt = 0;
2465   for (i=0, row=0; i<m; i++) {
2466     switch (sizes[i]){
2467       case 1:
2468 	x[row] = b[row]*bdiag[cnt++];row++;
2469 	break;
2470       case 2:
2471 	x1   = b[row]; x2 = b[row+1];
2472 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2473 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2474 	x[row++] = tmp1;
2475 	x[row++] = tmp2;
2476 	cnt += 4;
2477 	break;
2478       case 3:
2479 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
2480 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2481 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2482 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2483 	x[row++] = tmp1;
2484 	x[row++] = tmp2;
2485 	x[row++] = tmp3;
2486 	cnt += 9;
2487 	break;
2488       case 4:
2489 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
2490 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2491 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2492 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2493 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2494 	x[row++] = tmp1;
2495 	x[row++] = tmp2;
2496 	x[row++] = tmp3;
2497 	x[row++] = tmp4;
2498 	cnt += 16;
2499 	break;
2500       case 5:
2501 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
2502 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2503 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2504 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2505 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2506 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2507 	x[row++] = tmp1;
2508 	x[row++] = tmp2;
2509 	x[row++] = tmp3;
2510 	x[row++] = tmp4;
2511 	x[row++] = tmp5;
2512 	cnt += 25;
2513 	break;
2514       default:
2515 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2516     }
2517   }
2518   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
2519   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2520   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2521   PetscFunctionReturn(0);
2522 }
2523 
2524 /*
2525     samestructure indicates that the matrix has not changed its nonzero structure so we
2526     do not need to recompute the inodes
2527 */
2528 #undef __FUNCT__
2529 #define __FUNCT__ "Mat_CheckInode"
2530 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2531 {
2532   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2533   PetscErrorCode ierr;
2534   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2535   PetscTruth     flag;
2536 
2537   PetscFunctionBegin;
2538   if (!a->inode.use)                     PetscFunctionReturn(0);
2539   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2540 
2541 
2542   m = A->rmap->n;
2543   if (a->inode.size) {ns = a->inode.size;}
2544   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2545 
2546   i          = 0;
2547   node_count = 0;
2548   idx        = a->j;
2549   ii         = a->i;
2550   while (i < m){                /* For each row */
2551     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2552     /* Limits the number of elements in a node to 'a->inode.limit' */
2553     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2554       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2555       if (nzy != nzx) break;
2556       idy  += nzx;             /* Same nonzero pattern */
2557       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2558       if (!flag) break;
2559     }
2560     ns[node_count++] = blk_size;
2561     idx += blk_size*nzx;
2562     i    = j;
2563   }
2564   /* If not enough inodes found,, do not use inode version of the routines */
2565   if (!a->inode.size && m && node_count > .9*m) {
2566     ierr = PetscFree(ns);CHKERRQ(ierr);
2567     a->inode.node_count     = 0;
2568     a->inode.size           = PETSC_NULL;
2569     a->inode.use            = PETSC_FALSE;
2570     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2571   } else {
2572     A->ops->mult              = MatMult_SeqAIJ_Inode;
2573     A->ops->sor               = MatSOR_SeqAIJ_Inode;
2574     A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
2575     A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
2576     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
2577     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
2578     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
2579     A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
2580     A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
2581     a->inode.node_count       = node_count;
2582     a->inode.size             = ns;
2583     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2584   }
2585   PetscFunctionReturn(0);
2586 }
2587 
2588 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
2589 PetscInt __k, *__vi; \
2590 __vi = aj + ai[row];				\
2591 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
2592 __vi = aj + adiag[row];				\
2593 cols[nzl] = __vi[0];\
2594 __vi = aj + adiag[row+1]+1;\
2595 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
2596 
2597 #undef __FUNCT__
2598 #define __FUNCT__ "Mat_CheckInode_FactorLU"
2599 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
2600 {
2601   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2602   PetscErrorCode ierr;
2603   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
2604   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
2605   PetscTruth     flag;
2606 
2607   PetscFunctionBegin;
2608   if (!a->inode.use)                     PetscFunctionReturn(0);
2609   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2610 
2611   m = A->rmap->n;
2612   if (a->inode.size) {ns = a->inode.size;}
2613   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2614 
2615   i          = 0;
2616   node_count = 0;
2617   while (i < m){                /* For each row */
2618     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
2619     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
2620     nzx  = nzl1 + nzu1 + 1;
2621     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
2622     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
2623 
2624     /* Limits the number of elements in a node to 'a->inode.limit' */
2625     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2626       nzl2    = ai[j+1] - ai[j];
2627       nzu2    = adiag[j] - adiag[j+1] - 1;
2628       nzy     = nzl2 + nzu2 + 1;
2629       if( nzy != nzx) break;
2630       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
2631       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
2632       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2633       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
2634       ierr = PetscFree(cols2);CHKERRQ(ierr);
2635     }
2636     ns[node_count++] = blk_size;
2637     ierr = PetscFree(cols1);CHKERRQ(ierr);
2638     i    = j;
2639   }
2640   /* If not enough inodes found,, do not use inode version of the routines */
2641   if (!a->inode.size && m && node_count > .9*m) {
2642     ierr = PetscFree(ns);CHKERRQ(ierr);
2643     a->inode.node_count     = 0;
2644     a->inode.size           = PETSC_NULL;
2645     a->inode.use            = PETSC_FALSE;
2646     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2647   } else {
2648     A->ops->mult              = 0;
2649     A->ops->sor               = 0;
2650     A->ops->multadd           = 0;
2651     A->ops->getrowij          = 0;
2652     A->ops->restorerowij      = 0;
2653     A->ops->getcolumnij       = 0;
2654     A->ops->restorecolumnij   = 0;
2655     A->ops->coloringpatch     = 0;
2656     A->ops->multdiagonalblock = 0;
2657     a->inode.node_count       = node_count;
2658     a->inode.size             = ns;
2659     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2660   }
2661   PetscFunctionReturn(0);
2662 }
2663 
2664 /*
2665      This is really ugly. if inodes are used this replaces the
2666   permutations with ones that correspond to rows/cols of the matrix
2667   rather then inode blocks
2668 */
2669 #undef __FUNCT__
2670 #define __FUNCT__ "MatInodeAdjustForInodes"
2671 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2672 {
2673   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2674 
2675   PetscFunctionBegin;
2676   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2677   if (f) {
2678     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2679   }
2680   PetscFunctionReturn(0);
2681 }
2682 
2683 EXTERN_C_BEGIN
2684 #undef __FUNCT__
2685 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
2686 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
2687 {
2688   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2689   PetscErrorCode ierr;
2690   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
2691   const PetscInt *ridx,*cidx;
2692   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2693   PetscInt       nslim_col,*ns_col;
2694   IS             ris = *rperm,cis = *cperm;
2695 
2696   PetscFunctionBegin;
2697   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2698   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2699 
2700   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2701   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2702   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
2703 
2704   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2705   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2706 
2707   /* Form the inode structure for the rows of permuted matric using inv perm*/
2708   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2709 
2710   /* Construct the permutations for rows*/
2711   for (i=0,row = 0; i<nslim_row; ++i){
2712     indx      = ridx[i];
2713     start_val = tns[indx];
2714     end_val   = tns[indx + 1];
2715     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2716   }
2717 
2718   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2719   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2720 
2721  /* Construct permutations for columns */
2722   for (i=0,col=0; i<nslim_col; ++i){
2723     indx      = cidx[i];
2724     start_val = tns[indx];
2725     end_val   = tns[indx + 1];
2726     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2727   }
2728 
2729   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2730   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
2731   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2732   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
2733 
2734   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2735   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2736 
2737   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2738   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
2739   ierr = ISDestroy(cis);CHKERRQ(ierr);
2740   ierr = ISDestroy(ris);CHKERRQ(ierr);
2741   ierr = PetscFree(tns);CHKERRQ(ierr);
2742   PetscFunctionReturn(0);
2743 }
2744 EXTERN_C_END
2745 
2746 #undef __FUNCT__
2747 #define __FUNCT__ "MatInodeGetInodeSizes"
2748 /*@C
2749    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2750 
2751    Collective on Mat
2752 
2753    Input Parameter:
2754 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2755 
2756    Output Parameter:
2757 +  node_count - no of inodes present in the matrix.
2758 .  sizes      - an array of size node_count,with sizes of each inode.
2759 -  limit      - the max size used to generate the inodes.
2760 
2761    Level: advanced
2762 
2763    Notes: This routine returns some internal storage information
2764    of the matrix, it is intended to be used by advanced users.
2765    It should be called after the matrix is assembled.
2766    The contents of the sizes[] array should not be changed.
2767    PETSC_NULL may be passed for information not requested.
2768 
2769 .keywords: matrix, seqaij, get, inode
2770 
2771 .seealso: MatGetInfo()
2772 @*/
2773 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2774 {
2775   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2776 
2777   PetscFunctionBegin;
2778   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2779   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2780   if (f) {
2781     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2782   }
2783   PetscFunctionReturn(0);
2784 }
2785 
2786 EXTERN_C_BEGIN
2787 #undef __FUNCT__
2788 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
2789 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2790 {
2791   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2792 
2793   PetscFunctionBegin;
2794   if (node_count) *node_count = a->inode.node_count;
2795   if (sizes)      *sizes      = a->inode.size;
2796   if (limit)      *limit      = a->inode.limit;
2797   PetscFunctionReturn(0);
2798 }
2799 EXTERN_C_END
2800