xref: /petsc/src/mat/impls/aij/seq/inode.c (revision da18021fca600d13c5e78c2cfb1afa98ec39d5c2)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     sz   = n;                   /* No of non zeros in this row */
427                                 /* Switch on the size of Node */
428     switch (nsz){               /* Each loop in 'case' is unrolled */
429     case 1 :
430       sum1  = 0.;
431 
432       for(n = 0; n< sz-1; n+=2) {
433         i1   = idx[0];          /* The instructions are ordered to */
434         i2   = idx[1];          /* make the compiler's job easy */
435         idx += 2;
436         tmp0 = x[i1];
437         tmp1 = x[i2];
438         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
439        }
440 
441       if (n == sz-1){          /* Take care of the last nonzero  */
442         tmp0  = x[*idx++];
443         sum1 += *v1++ * tmp0;
444       }
445       y[row++]=sum1;
446       break;
447     case 2:
448       sum1  = 0.;
449       sum2  = 0.;
450       v2    = v1 + n;
451 
452       for (n = 0; n< sz-1; n+=2) {
453         i1   = idx[0];
454         i2   = idx[1];
455         idx += 2;
456         tmp0 = x[i1];
457         tmp1 = x[i2];
458         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
459         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
460       }
461       if (n == sz-1){
462         tmp0  = x[*idx++];
463         sum1 += *v1++ * tmp0;
464         sum2 += *v2++ * tmp0;
465       }
466       y[row++]=sum1;
467       y[row++]=sum2;
468       v1      =v2;              /* Since the next block to be processed starts there*/
469       idx    +=sz;
470       break;
471     case 3:
472       sum1  = 0.;
473       sum2  = 0.;
474       sum3  = 0.;
475       v2    = v1 + n;
476       v3    = v2 + n;
477 
478       for (n = 0; n< sz-1; n+=2) {
479         i1   = idx[0];
480         i2   = idx[1];
481         idx += 2;
482         tmp0 = x[i1];
483         tmp1 = x[i2];
484         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
485         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
486         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
487       }
488       if (n == sz-1){
489         tmp0  = x[*idx++];
490         sum1 += *v1++ * tmp0;
491         sum2 += *v2++ * tmp0;
492         sum3 += *v3++ * tmp0;
493       }
494       y[row++]=sum1;
495       y[row++]=sum2;
496       y[row++]=sum3;
497       v1       =v3;             /* Since the next block to be processed starts there*/
498       idx     +=2*sz;
499       break;
500     case 4:
501       sum1  = 0.;
502       sum2  = 0.;
503       sum3  = 0.;
504       sum4  = 0.;
505       v2    = v1 + n;
506       v3    = v2 + n;
507       v4    = v3 + n;
508 
509       for (n = 0; n< sz-1; n+=2) {
510         i1   = idx[0];
511         i2   = idx[1];
512         idx += 2;
513         tmp0 = x[i1];
514         tmp1 = x[i2];
515         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519       }
520       if (n == sz-1){
521         tmp0  = x[*idx++];
522         sum1 += *v1++ * tmp0;
523         sum2 += *v2++ * tmp0;
524         sum3 += *v3++ * tmp0;
525         sum4 += *v4++ * tmp0;
526       }
527       y[row++]=sum1;
528       y[row++]=sum2;
529       y[row++]=sum3;
530       y[row++]=sum4;
531       v1      =v4;              /* Since the next block to be processed starts there*/
532       idx    +=3*sz;
533       break;
534     case 5:
535       sum1  = 0.;
536       sum2  = 0.;
537       sum3  = 0.;
538       sum4  = 0.;
539       sum5  = 0.;
540       v2    = v1 + n;
541       v3    = v2 + n;
542       v4    = v3 + n;
543       v5    = v4 + n;
544 
545       for (n = 0; n<sz-1; n+=2) {
546         i1   = idx[0];
547         i2   = idx[1];
548         idx += 2;
549         tmp0 = x[i1];
550         tmp1 = x[i2];
551         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
552         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
553         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
554         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
555         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
556       }
557       if (n == sz-1){
558         tmp0  = x[*idx++];
559         sum1 += *v1++ * tmp0;
560         sum2 += *v2++ * tmp0;
561         sum3 += *v3++ * tmp0;
562         sum4 += *v4++ * tmp0;
563         sum5 += *v5++ * tmp0;
564       }
565       y[row++]=sum1;
566       y[row++]=sum2;
567       y[row++]=sum3;
568       y[row++]=sum4;
569       y[row++]=sum5;
570       v1      =v5;       /* Since the next block to be processed starts there */
571       idx    +=4*sz;
572       break;
573     default :
574       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
575     }
576   }
577   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
578   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
579   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
580   PetscFunctionReturn(0);
581 }
582 /* ----------------------------------------------------------- */
583 /* Almost same code as the MatMult_SeqAIJ_Inode() */
584 #undef __FUNCT__
585 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
586 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
587 {
588   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
589   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
590   MatScalar      *v1,*v2,*v3,*v4,*v5;
591   PetscScalar    *x,*y,*z,*zt;
592   PetscErrorCode ierr;
593   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
594 
595   PetscFunctionBegin;
596   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
597   node_max = a->inode.node_count;
598   ns       = a->inode.size;     /* Node Size array */
599   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
600   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
601   if (zz != yy) {
602     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
603   } else {
604     z = y;
605   }
606   zt = z;
607 
608   idx  = a->j;
609   v1   = a->a;
610   ii   = a->i;
611 
612   for (i = 0,row = 0; i< node_max; ++i){
613     nsz  = ns[i];
614     n    = ii[1] - ii[0];
615     ii  += nsz;
616     sz   = n;                   /* No of non zeros in this row */
617                                 /* Switch on the size of Node */
618     switch (nsz){               /* Each loop in 'case' is unrolled */
619     case 1 :
620       sum1  = *zt++;
621 
622       for(n = 0; n< sz-1; n+=2) {
623         i1   = idx[0];          /* The instructions are ordered to */
624         i2   = idx[1];          /* make the compiler's job easy */
625         idx += 2;
626         tmp0 = x[i1];
627         tmp1 = x[i2];
628         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
629        }
630 
631       if(n   == sz-1){          /* Take care of the last nonzero  */
632         tmp0  = x[*idx++];
633         sum1 += *v1++ * tmp0;
634       }
635       y[row++]=sum1;
636       break;
637     case 2:
638       sum1  = *zt++;
639       sum2  = *zt++;
640       v2    = v1 + n;
641 
642       for(n = 0; n< sz-1; n+=2) {
643         i1   = idx[0];
644         i2   = idx[1];
645         idx += 2;
646         tmp0 = x[i1];
647         tmp1 = x[i2];
648         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
649         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
650       }
651       if(n   == sz-1){
652         tmp0  = x[*idx++];
653         sum1 += *v1++ * tmp0;
654         sum2 += *v2++ * tmp0;
655       }
656       y[row++]=sum1;
657       y[row++]=sum2;
658       v1      =v2;              /* Since the next block to be processed starts there*/
659       idx    +=sz;
660       break;
661     case 3:
662       sum1  = *zt++;
663       sum2  = *zt++;
664       sum3  = *zt++;
665       v2    = v1 + n;
666       v3    = v2 + n;
667 
668       for (n = 0; n< sz-1; n+=2) {
669         i1   = idx[0];
670         i2   = idx[1];
671         idx += 2;
672         tmp0 = x[i1];
673         tmp1 = x[i2];
674         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
675         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
676         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
677       }
678       if (n == sz-1){
679         tmp0  = x[*idx++];
680         sum1 += *v1++ * tmp0;
681         sum2 += *v2++ * tmp0;
682         sum3 += *v3++ * tmp0;
683       }
684       y[row++]=sum1;
685       y[row++]=sum2;
686       y[row++]=sum3;
687       v1       =v3;             /* Since the next block to be processed starts there*/
688       idx     +=2*sz;
689       break;
690     case 4:
691       sum1  = *zt++;
692       sum2  = *zt++;
693       sum3  = *zt++;
694       sum4  = *zt++;
695       v2    = v1 + n;
696       v3    = v2 + n;
697       v4    = v3 + n;
698 
699       for (n = 0; n< sz-1; n+=2) {
700         i1   = idx[0];
701         i2   = idx[1];
702         idx += 2;
703         tmp0 = x[i1];
704         tmp1 = x[i2];
705         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
706         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
707         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
708         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
709       }
710       if (n == sz-1){
711         tmp0  = x[*idx++];
712         sum1 += *v1++ * tmp0;
713         sum2 += *v2++ * tmp0;
714         sum3 += *v3++ * tmp0;
715         sum4 += *v4++ * tmp0;
716       }
717       y[row++]=sum1;
718       y[row++]=sum2;
719       y[row++]=sum3;
720       y[row++]=sum4;
721       v1      =v4;              /* Since the next block to be processed starts there*/
722       idx    +=3*sz;
723       break;
724     case 5:
725       sum1  = *zt++;
726       sum2  = *zt++;
727       sum3  = *zt++;
728       sum4  = *zt++;
729       sum5  = *zt++;
730       v2    = v1 + n;
731       v3    = v2 + n;
732       v4    = v3 + n;
733       v5    = v4 + n;
734 
735       for (n = 0; n<sz-1; n+=2) {
736         i1   = idx[0];
737         i2   = idx[1];
738         idx += 2;
739         tmp0 = x[i1];
740         tmp1 = x[i2];
741         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
742         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
743         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
744         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
745         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
746       }
747       if(n   == sz-1){
748         tmp0  = x[*idx++];
749         sum1 += *v1++ * tmp0;
750         sum2 += *v2++ * tmp0;
751         sum3 += *v3++ * tmp0;
752         sum4 += *v4++ * tmp0;
753         sum5 += *v5++ * tmp0;
754       }
755       y[row++]=sum1;
756       y[row++]=sum2;
757       y[row++]=sum3;
758       y[row++]=sum4;
759       y[row++]=sum5;
760       v1      =v5;       /* Since the next block to be processed starts there */
761       idx    +=4*sz;
762       break;
763     default :
764       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
765     }
766   }
767   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
768   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
769   if (zz != yy) {
770     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
771   }
772   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
773   PetscFunctionReturn(0);
774 }
775 
776 /* ----------------------------------------------------------- */
777 #undef __FUNCT__
778 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
779 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
780 {
781   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
782   IS                iscol = a->col,isrow = a->row;
783   PetscErrorCode    ierr;
784   const PetscInt    *r,*c,*rout,*cout;
785   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
786   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
787   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
788   PetscScalar       sum1,sum2,sum3,sum4,sum5;
789   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790   const PetscScalar *b;
791 
792   PetscFunctionBegin;
793   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
794   node_max = a->inode.node_count;
795   ns       = a->inode.size;     /* Node Size array */
796 
797   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   tmp  = a->solve_work;
800 
801   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
802   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
803 
804   /* forward solve the lower triangular */
805   tmps = tmp ;
806   aa   = a_a ;
807   aj   = a_j ;
808   ad   = a->diag;
809 
810   for (i = 0,row = 0; i< node_max; ++i){
811     nsz = ns[i];
812     aii = ai[row];
813     v1  = aa + aii;
814     vi  = aj + aii;
815     nz  = ad[row]- aii;
816 
817     switch (nsz){               /* Each loop in 'case' is unrolled */
818     case 1 :
819       sum1 = b[*r++];
820       for(j=0; j<nz-1; j+=2){
821         i0   = vi[0];
822         i1   = vi[1];
823         vi  +=2;
824         tmp0 = tmps[i0];
825         tmp1 = tmps[i1];
826         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
827       }
828       if(j == nz-1){
829         tmp0 = tmps[*vi++];
830         sum1 -= *v1++ *tmp0;
831       }
832       tmp[row ++]=sum1;
833       break;
834     case 2:
835       sum1 = b[*r++];
836       sum2 = b[*r++];
837       v2   = aa + ai[row+1];
838 
839       for(j=0; j<nz-1; j+=2){
840         i0   = vi[0];
841         i1   = vi[1];
842         vi  +=2;
843         tmp0 = tmps[i0];
844         tmp1 = tmps[i1];
845         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
846         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
847       }
848       if(j == nz-1){
849         tmp0 = tmps[*vi++];
850         sum1 -= *v1++ *tmp0;
851         sum2 -= *v2++ *tmp0;
852       }
853       sum2 -= *v2++ * sum1;
854       tmp[row ++]=sum1;
855       tmp[row ++]=sum2;
856       break;
857     case 3:
858       sum1 = b[*r++];
859       sum2 = b[*r++];
860       sum3 = b[*r++];
861       v2   = aa + ai[row+1];
862       v3   = aa + ai[row+2];
863 
864       for (j=0; j<nz-1; j+=2){
865         i0   = vi[0];
866         i1   = vi[1];
867         vi  +=2;
868         tmp0 = tmps[i0];
869         tmp1 = tmps[i1];
870         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
871         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
872         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
873       }
874       if (j == nz-1){
875         tmp0 = tmps[*vi++];
876         sum1 -= *v1++ *tmp0;
877         sum2 -= *v2++ *tmp0;
878         sum3 -= *v3++ *tmp0;
879       }
880       sum2 -= *v2++ * sum1;
881       sum3 -= *v3++ * sum1;
882       sum3 -= *v3++ * sum2;
883       tmp[row ++]=sum1;
884       tmp[row ++]=sum2;
885       tmp[row ++]=sum3;
886       break;
887 
888     case 4:
889       sum1 = b[*r++];
890       sum2 = b[*r++];
891       sum3 = b[*r++];
892       sum4 = b[*r++];
893       v2   = aa + ai[row+1];
894       v3   = aa + ai[row+2];
895       v4   = aa + ai[row+3];
896 
897       for (j=0; j<nz-1; j+=2){
898         i0   = vi[0];
899         i1   = vi[1];
900         vi  +=2;
901         tmp0 = tmps[i0];
902         tmp1 = tmps[i1];
903         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
904         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
905         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
906         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
907       }
908       if (j == nz-1){
909         tmp0 = tmps[*vi++];
910         sum1 -= *v1++ *tmp0;
911         sum2 -= *v2++ *tmp0;
912         sum3 -= *v3++ *tmp0;
913         sum4 -= *v4++ *tmp0;
914       }
915       sum2 -= *v2++ * sum1;
916       sum3 -= *v3++ * sum1;
917       sum4 -= *v4++ * sum1;
918       sum3 -= *v3++ * sum2;
919       sum4 -= *v4++ * sum2;
920       sum4 -= *v4++ * sum3;
921 
922       tmp[row ++]=sum1;
923       tmp[row ++]=sum2;
924       tmp[row ++]=sum3;
925       tmp[row ++]=sum4;
926       break;
927     case 5:
928       sum1 = b[*r++];
929       sum2 = b[*r++];
930       sum3 = b[*r++];
931       sum4 = b[*r++];
932       sum5 = b[*r++];
933       v2   = aa + ai[row+1];
934       v3   = aa + ai[row+2];
935       v4   = aa + ai[row+3];
936       v5   = aa + ai[row+4];
937 
938       for (j=0; j<nz-1; j+=2){
939         i0   = vi[0];
940         i1   = vi[1];
941         vi  +=2;
942         tmp0 = tmps[i0];
943         tmp1 = tmps[i1];
944         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
945         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
946         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
947         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
948         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
949       }
950       if (j == nz-1){
951         tmp0 = tmps[*vi++];
952         sum1 -= *v1++ *tmp0;
953         sum2 -= *v2++ *tmp0;
954         sum3 -= *v3++ *tmp0;
955         sum4 -= *v4++ *tmp0;
956         sum5 -= *v5++ *tmp0;
957       }
958 
959       sum2 -= *v2++ * sum1;
960       sum3 -= *v3++ * sum1;
961       sum4 -= *v4++ * sum1;
962       sum5 -= *v5++ * sum1;
963       sum3 -= *v3++ * sum2;
964       sum4 -= *v4++ * sum2;
965       sum5 -= *v5++ * sum2;
966       sum4 -= *v4++ * sum3;
967       sum5 -= *v5++ * sum3;
968       sum5 -= *v5++ * sum4;
969 
970       tmp[row ++]=sum1;
971       tmp[row ++]=sum2;
972       tmp[row ++]=sum3;
973       tmp[row ++]=sum4;
974       tmp[row ++]=sum5;
975       break;
976     default:
977       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
978     }
979   }
980   /* backward solve the upper triangular */
981   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
982     nsz = ns[i];
983     aii = ai[row+1] -1;
984     v1  = aa + aii;
985     vi  = aj + aii;
986     nz  = aii- ad[row];
987     switch (nsz){               /* Each loop in 'case' is unrolled */
988     case 1 :
989       sum1 = tmp[row];
990 
991       for(j=nz ; j>1; j-=2){
992         vi  -=2;
993         i0   = vi[2];
994         i1   = vi[1];
995         tmp0 = tmps[i0];
996         tmp1 = tmps[i1];
997         v1   -= 2;
998         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
999       }
1000       if (j==1){
1001         tmp0  = tmps[*vi--];
1002         sum1 -= *v1-- * tmp0;
1003       }
1004       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1005       break;
1006     case 2 :
1007       sum1 = tmp[row];
1008       sum2 = tmp[row -1];
1009       v2   = aa + ai[row]-1;
1010       for (j=nz ; j>1; j-=2){
1011         vi  -=2;
1012         i0   = vi[2];
1013         i1   = vi[1];
1014         tmp0 = tmps[i0];
1015         tmp1 = tmps[i1];
1016         v1   -= 2;
1017         v2   -= 2;
1018         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1019         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1020       }
1021       if (j==1){
1022         tmp0  = tmps[*vi--];
1023         sum1 -= *v1-- * tmp0;
1024         sum2 -= *v2-- * tmp0;
1025       }
1026 
1027       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1028       sum2   -= *v2-- * tmp0;
1029       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1030       break;
1031     case 3 :
1032       sum1 = tmp[row];
1033       sum2 = tmp[row -1];
1034       sum3 = tmp[row -2];
1035       v2   = aa + ai[row]-1;
1036       v3   = aa + ai[row -1]-1;
1037       for (j=nz ; j>1; j-=2){
1038         vi  -=2;
1039         i0   = vi[2];
1040         i1   = vi[1];
1041         tmp0 = tmps[i0];
1042         tmp1 = tmps[i1];
1043         v1   -= 2;
1044         v2   -= 2;
1045         v3   -= 2;
1046         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1047         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1048         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1049       }
1050       if (j==1){
1051         tmp0  = tmps[*vi--];
1052         sum1 -= *v1-- * tmp0;
1053         sum2 -= *v2-- * tmp0;
1054         sum3 -= *v3-- * tmp0;
1055       }
1056       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1057       sum2   -= *v2-- * tmp0;
1058       sum3   -= *v3-- * tmp0;
1059       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1060       sum3   -= *v3-- * tmp0;
1061       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1062 
1063       break;
1064     case 4 :
1065       sum1 = tmp[row];
1066       sum2 = tmp[row -1];
1067       sum3 = tmp[row -2];
1068       sum4 = tmp[row -3];
1069       v2   = aa + ai[row]-1;
1070       v3   = aa + ai[row -1]-1;
1071       v4   = aa + ai[row -2]-1;
1072 
1073       for (j=nz ; j>1; j-=2){
1074         vi  -=2;
1075         i0   = vi[2];
1076         i1   = vi[1];
1077         tmp0 = tmps[i0];
1078         tmp1 = tmps[i1];
1079         v1  -= 2;
1080         v2  -= 2;
1081         v3  -= 2;
1082         v4  -= 2;
1083         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1084         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1085         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1086         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1087       }
1088       if (j==1){
1089         tmp0  = tmps[*vi--];
1090         sum1 -= *v1-- * tmp0;
1091         sum2 -= *v2-- * tmp0;
1092         sum3 -= *v3-- * tmp0;
1093         sum4 -= *v4-- * tmp0;
1094       }
1095 
1096       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1097       sum2   -= *v2-- * tmp0;
1098       sum3   -= *v3-- * tmp0;
1099       sum4   -= *v4-- * tmp0;
1100       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1101       sum3   -= *v3-- * tmp0;
1102       sum4   -= *v4-- * tmp0;
1103       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1104       sum4   -= *v4-- * tmp0;
1105       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1106       break;
1107     case 5 :
1108       sum1 = tmp[row];
1109       sum2 = tmp[row -1];
1110       sum3 = tmp[row -2];
1111       sum4 = tmp[row -3];
1112       sum5 = tmp[row -4];
1113       v2   = aa + ai[row]-1;
1114       v3   = aa + ai[row -1]-1;
1115       v4   = aa + ai[row -2]-1;
1116       v5   = aa + ai[row -3]-1;
1117       for (j=nz ; j>1; j-=2){
1118         vi  -= 2;
1119         i0   = vi[2];
1120         i1   = vi[1];
1121         tmp0 = tmps[i0];
1122         tmp1 = tmps[i1];
1123         v1   -= 2;
1124         v2   -= 2;
1125         v3   -= 2;
1126         v4   -= 2;
1127         v5   -= 2;
1128         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1129         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1130         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1131         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1132         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1133       }
1134       if (j==1){
1135         tmp0  = tmps[*vi--];
1136         sum1 -= *v1-- * tmp0;
1137         sum2 -= *v2-- * tmp0;
1138         sum3 -= *v3-- * tmp0;
1139         sum4 -= *v4-- * tmp0;
1140         sum5 -= *v5-- * tmp0;
1141       }
1142 
1143       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1144       sum2   -= *v2-- * tmp0;
1145       sum3   -= *v3-- * tmp0;
1146       sum4   -= *v4-- * tmp0;
1147       sum5   -= *v5-- * tmp0;
1148       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1149       sum3   -= *v3-- * tmp0;
1150       sum4   -= *v4-- * tmp0;
1151       sum5   -= *v5-- * tmp0;
1152       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1153       sum4   -= *v4-- * tmp0;
1154       sum5   -= *v5-- * tmp0;
1155       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1156       sum5   -= *v5-- * tmp0;
1157       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1158       break;
1159     default:
1160       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1161     }
1162   }
1163   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1164   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1165   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1166   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1167   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1168   PetscFunctionReturn(0);
1169 }
1170 
1171 /* ----------------------------------------------------------- */
1172 #undef __FUNCT__
1173 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_newdatastruct"
1174 PetscErrorCode MatSolve_SeqAIJ_Inode_newdatastruct(Mat A,Vec bb,Vec xx)
1175 {
1176   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
1177   IS                iscol = a->col,isrow = a->row;
1178   PetscErrorCode    ierr;
1179   const PetscInt    *r,*c,*rout,*cout;
1180   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
1181   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
1182   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
1183   PetscScalar       sum1,sum2,sum3,sum4,sum5;
1184   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
1185   const PetscScalar *b;
1186 
1187   PetscFunctionBegin;
1188   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
1189   node_max = a->inode.node_count;
1190   ns       = a->inode.size;     /* Node Size array */
1191 
1192   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1193   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1194   tmp  = a->solve_work;
1195 
1196   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1197   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1198 
1199   /* forward solve the lower triangular */
1200   tmps = tmp ;
1201   aa   = a_a ;
1202   aj   = a_j ;
1203   ad   = a->diag;
1204 
1205   for (i = 0,row = 0; i< node_max; ++i){
1206     nsz = ns[i];
1207     aii = ai[row];
1208     v1  = aa + aii;
1209     vi  = aj + aii;
1210     nz  = ai[row+1]- ai[row];
1211 
1212     switch (nsz){               /* Each loop in 'case' is unrolled */
1213     case 1 :
1214       sum1 = b[r[row]];
1215       for(j=0; j<nz-1; j+=2){
1216         i0   = vi[j];
1217         i1   = vi[j+1];
1218         tmp0 = tmps[i0];
1219         tmp1 = tmps[i1];
1220         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
1221       }
1222       if(j == nz-1){
1223         tmp0 = tmps[vi[j]];
1224         sum1 -= v1[j]*tmp0;
1225       }
1226       tmp[row++]=sum1;
1227       break;
1228     case 2:
1229       sum1 = b[r[row]];
1230       sum2 = b[r[row+1]];
1231       v2   = aa + ai[row+1];
1232 
1233       for(j=0; j<nz-1; j+=2){
1234         i0   = vi[j];
1235         i1   = vi[j+1];
1236         tmp0 = tmps[i0];
1237         tmp1 = tmps[i1];
1238         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1239         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1240       }
1241       if(j == nz-1){
1242         tmp0 = tmps[vi[j]];
1243         sum1 -= v1[j] *tmp0;
1244         sum2 -= v2[j] *tmp0;
1245       }
1246       sum2 -= v2[nz] * sum1;
1247       tmp[row ++]=sum1;
1248       tmp[row ++]=sum2;
1249       break;
1250     case 3:
1251       sum1 = b[r[row]];
1252       sum2 = b[r[row+1]];
1253       sum3 = b[r[row+2]];
1254       v2   = aa + ai[row+1];
1255       v3   = aa + ai[row+2];
1256 
1257       for (j=0; j<nz-1; j+=2){
1258         i0   = vi[j];
1259         i1   = vi[j+1];
1260         tmp0 = tmps[i0];
1261         tmp1 = tmps[i1];
1262         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1263         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1264         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1265       }
1266       if (j == nz-1){
1267         tmp0 = tmps[vi[j]];
1268         sum1 -= v1[j] *tmp0;
1269         sum2 -= v2[j] *tmp0;
1270         sum3 -= v3[j] *tmp0;
1271       }
1272       sum2 -= v2[nz] * sum1;
1273       sum3 -= v3[nz] * sum1;
1274       sum3 -= v3[nz+1] * sum2;
1275       tmp[row ++]=sum1;
1276       tmp[row ++]=sum2;
1277       tmp[row ++]=sum3;
1278       break;
1279 
1280     case 4:
1281       sum1 = b[r[row]];
1282       sum2 = b[r[row+1]];
1283       sum3 = b[r[row+2]];
1284       sum4 = b[r[row+3]];
1285       v2   = aa + ai[row+1];
1286       v3   = aa + ai[row+2];
1287       v4   = aa + ai[row+3];
1288 
1289       for (j=0; j<nz-1; j+=2){
1290         i0   = vi[j];
1291         i1   = vi[j+1];
1292         tmp0 = tmps[i0];
1293         tmp1 = tmps[i1];
1294         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1295         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1296         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1297         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
1298       }
1299       if (j == nz-1){
1300         tmp0 = tmps[vi[j]];
1301         sum1 -= v1[j] *tmp0;
1302         sum2 -= v2[j] *tmp0;
1303         sum3 -= v3[j] *tmp0;
1304         sum4 -= v4[j] *tmp0;
1305       }
1306       sum2 -= v2[nz] * sum1;
1307       sum3 -= v3[nz] * sum1;
1308       sum4 -= v4[nz] * sum1;
1309       sum3 -= v3[nz+1] * sum2;
1310       sum4 -= v4[nz+1] * sum2;
1311       sum4 -= v4[nz+2] * sum3;
1312 
1313       tmp[row ++]=sum1;
1314       tmp[row ++]=sum2;
1315       tmp[row ++]=sum3;
1316       tmp[row ++]=sum4;
1317       break;
1318     case 5:
1319       sum1 = b[r[row]];
1320       sum2 = b[r[row+1]];
1321       sum3 = b[r[row+2]];
1322       sum4 = b[r[row+3]];
1323       sum5 = b[r[row+4]];
1324       v2   = aa + ai[row+1];
1325       v3   = aa + ai[row+2];
1326       v4   = aa + ai[row+3];
1327       v5   = aa + ai[row+4];
1328 
1329       for (j=0; j<nz-1; j+=2){
1330         i0   = vi[j];
1331         i1   = vi[j+1];
1332         tmp0 = tmps[i0];
1333         tmp1 = tmps[i1];
1334         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1335         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1336         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1337         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
1338         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
1339       }
1340       if (j == nz-1){
1341         tmp0 = tmps[vi[j]];
1342         sum1 -= v1[j] *tmp0;
1343         sum2 -= v2[j] *tmp0;
1344         sum3 -= v3[j] *tmp0;
1345         sum4 -= v4[j] *tmp0;
1346         sum5 -= v5[j] *tmp0;
1347       }
1348 
1349       sum2 -= v2[nz] * sum1;
1350       sum3 -= v3[nz] * sum1;
1351       sum4 -= v4[nz] * sum1;
1352       sum5 -= v5[nz] * sum1;
1353       sum3 -= v3[nz+1] * sum2;
1354       sum4 -= v4[nz+1] * sum2;
1355       sum5 -= v5[nz+1] * sum2;
1356       sum4 -= v4[nz+2] * sum3;
1357       sum5 -= v5[nz+2] * sum3;
1358       sum5 -= v5[nz+3] * sum4;
1359 
1360       tmp[row ++]=sum1;
1361       tmp[row ++]=sum2;
1362       tmp[row ++]=sum3;
1363       tmp[row ++]=sum4;
1364       tmp[row ++]=sum5;
1365       break;
1366     default:
1367       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1368     }
1369   }
1370   /* backward solve the upper triangular */
1371   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1372     nsz = ns[i];
1373     aii = ad[row+1] + 1;
1374     v1  = aa + aii;
1375     vi  = aj + aii;
1376     nz  = ad[row]- ad[row+1] - 1;
1377     switch (nsz){               /* Each loop in 'case' is unrolled */
1378     case 1 :
1379       sum1 = tmp[row];
1380 
1381       for(j=0 ; j<nz-1; j+=2){
1382         i0   = vi[j];
1383         i1   = vi[j+1];
1384         tmp0 = tmps[i0];
1385         tmp1 = tmps[i1];
1386         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1387       }
1388       if (j == nz-1){
1389         tmp0  = tmps[vi[j]];
1390         sum1 -= v1[j]*tmp0;
1391       }
1392       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1393       break;
1394     case 2 :
1395       sum1 = tmp[row];
1396       sum2 = tmp[row-1];
1397       v2   = aa + ad[row] + 1;
1398       for (j=0 ; j<nz-1; j+=2){
1399         i0   = vi[j];
1400         i1   = vi[j+1];
1401         tmp0 = tmps[i0];
1402         tmp1 = tmps[i1];
1403         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1404         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1405       }
1406       if (j == nz-1){
1407         tmp0  = tmps[vi[j]];
1408         sum1 -= v1[j]* tmp0;
1409         sum2 -= v2[j+1]* tmp0;
1410       }
1411 
1412       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1413       sum2   -= v2[0] * tmp0;
1414       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1415       break;
1416     case 3 :
1417       sum1 = tmp[row];
1418       sum2 = tmp[row -1];
1419       sum3 = tmp[row -2];
1420       v2   = aa + ad[row] + 1;
1421       v3   = aa + ad[row -1] + 1;
1422       for (j=0 ; j<nz-1; j+=2){
1423         i0   = vi[j];
1424         i1   = vi[j+1];
1425         tmp0 = tmps[i0];
1426         tmp1 = tmps[i1];
1427         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1428         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1429         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1430       }
1431       if (j== nz-1){
1432         tmp0  = tmps[vi[j]];
1433         sum1 -= v1[j] * tmp0;
1434         sum2 -= v2[j+1] * tmp0;
1435         sum3 -= v3[j+2] * tmp0;
1436       }
1437       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1438       sum2   -= v2[0]* tmp0;
1439       sum3   -= v3[1] * tmp0;
1440       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1441       sum3   -= v3[0]* tmp0;
1442       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1443 
1444       break;
1445     case 4 :
1446       sum1 = tmp[row];
1447       sum2 = tmp[row -1];
1448       sum3 = tmp[row -2];
1449       sum4 = tmp[row -3];
1450       v2   = aa + ad[row]+1;
1451       v3   = aa + ad[row -1]+1;
1452       v4   = aa + ad[row -2]+1;
1453 
1454       for (j=0 ; j<nz-1; j+=2){
1455         i0   = vi[j];
1456         i1   = vi[j+1];
1457         tmp0 = tmps[i0];
1458         tmp1 = tmps[i1];
1459         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
1460         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1461         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1462         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
1463       }
1464       if (j== nz-1){
1465         tmp0  = tmps[vi[j]];
1466         sum1 -= v1[j] * tmp0;
1467         sum2 -= v2[j+1] * tmp0;
1468         sum3 -= v3[j+2] * tmp0;
1469         sum4 -= v4[j+3] * tmp0;
1470       }
1471 
1472       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1473       sum2   -= v2[0] * tmp0;
1474       sum3   -= v3[1] * tmp0;
1475       sum4   -= v4[2] * tmp0;
1476       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1477       sum3   -= v3[0] * tmp0;
1478       sum4   -= v4[1] * tmp0;
1479       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1480       sum4   -= v4[0] * tmp0;
1481       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
1482       break;
1483     case 5 :
1484       sum1 = tmp[row];
1485       sum2 = tmp[row -1];
1486       sum3 = tmp[row -2];
1487       sum4 = tmp[row -3];
1488       sum5 = tmp[row -4];
1489       v2   = aa + ad[row]+1;
1490       v3   = aa + ad[row -1]+1;
1491       v4   = aa + ad[row -2]+1;
1492       v5   = aa + ad[row -3]+1;
1493       for (j=0 ; j<nz-1; j+=2){
1494         i0   = vi[j];
1495         i1   = vi[j+1];
1496         tmp0 = tmps[i0];
1497         tmp1 = tmps[i1];
1498         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1499         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1500         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1501         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
1502         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
1503       }
1504       if (j==nz-1){
1505         tmp0  = tmps[vi[j]];
1506         sum1 -= v1[j] * tmp0;
1507         sum2 -= v2[j+1] * tmp0;
1508         sum3 -= v3[j+2] * tmp0;
1509         sum4 -= v4[j+3] * tmp0;
1510         sum5 -= v5[j+4] * tmp0;
1511       }
1512 
1513       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1514       sum2   -= v2[0] * tmp0;
1515       sum3   -= v3[1] * tmp0;
1516       sum4   -= v4[2] * tmp0;
1517       sum5   -= v5[3] * tmp0;
1518       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1519       sum3   -= v3[0] * tmp0;
1520       sum4   -= v4[1] * tmp0;
1521       sum5   -= v5[2] * tmp0;
1522       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1523       sum4   -= v4[0] * tmp0;
1524       sum5   -= v5[1] * tmp0;
1525       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
1526       sum5   -= v5[0] * tmp0;
1527       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
1528       break;
1529     default:
1530       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1531     }
1532   }
1533   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1534   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1535   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1536   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1537   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1538   PetscFunctionReturn(0);
1539 }
1540 
1541 
1542 /*
1543      Makes a longer coloring[] array and calls the usual code with that
1544 */
1545 #undef __FUNCT__
1546 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
1547 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1548 {
1549   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1550   PetscErrorCode  ierr;
1551   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1552   PetscInt        *colorused,i;
1553   ISColoringValue *newcolor;
1554 
1555   PetscFunctionBegin;
1556   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1557   /* loop over inodes, marking a color for each column*/
1558   row = 0;
1559   for (i=0; i<m; i++){
1560     for (j=0; j<ns[i]; j++) {
1561       newcolor[row++] = coloring[i] + j*ncolors;
1562     }
1563   }
1564 
1565   /* eliminate unneeded colors */
1566   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1567   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1568   for (i=0; i<n; i++) {
1569     colorused[newcolor[i]] = 1;
1570   }
1571 
1572   for (i=1; i<5*ncolors; i++) {
1573     colorused[i] += colorused[i-1];
1574   }
1575   ncolors = colorused[5*ncolors-1];
1576   for (i=0; i<n; i++) {
1577     newcolor[i] = colorused[newcolor[i]]-1;
1578   }
1579   ierr = PetscFree(colorused);CHKERRQ(ierr);
1580   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1581   ierr = PetscFree(coloring);CHKERRQ(ierr);
1582   PetscFunctionReturn(0);
1583 }
1584 
1585 #include "../src/mat/blockinvert.h"
1586 
1587 #undef __FUNCT__
1588 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
1589 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1590 {
1591   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1592   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1593   MatScalar          *ibdiag,*bdiag;
1594   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1595   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
1596   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
1597   PetscErrorCode     ierr;
1598   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1599   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k;
1600 
1601   PetscFunctionBegin;
1602   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1603   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1604   if (its > 1) {
1605     /* switch to non-inode version */
1606     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
1607     PetscFunctionReturn(0);
1608   }
1609 
1610   if (!a->inode.ibdiagvalid) {
1611     if (!a->inode.ibdiag) {
1612       /* calculate space needed for diagonal blocks */
1613       for (i=0; i<m; i++) {
1614 	cnt += sizes[i]*sizes[i];
1615       }
1616       a->inode.bdiagsize = cnt;
1617       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
1618     }
1619 
1620     /* copy over the diagonal blocks and invert them */
1621     ibdiag = a->inode.ibdiag;
1622     bdiag  = a->inode.bdiag;
1623     cnt = 0;
1624     for (i=0, row = 0; i<m; i++) {
1625       for (j=0; j<sizes[i]; j++) {
1626         for (k=0; k<sizes[i]; k++) {
1627           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1628         }
1629       }
1630       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
1631 
1632       switch(sizes[i]) {
1633         case 1:
1634           /* Create matrix data structure */
1635           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1636           ibdiag[cnt] = 1.0/ibdiag[cnt];
1637           break;
1638         case 2:
1639           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
1640           break;
1641         case 3:
1642           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
1643           break;
1644         case 4:
1645           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
1646           break;
1647         case 5:
1648           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr);
1649           break;
1650        default:
1651 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1652       }
1653       cnt += sizes[i]*sizes[i];
1654       row += sizes[i];
1655     }
1656     a->inode.ibdiagvalid = PETSC_TRUE;
1657   }
1658   ibdiag = a->inode.ibdiag;
1659   bdiag  = a->inode.bdiag;
1660 
1661 
1662   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1663   if (flag & SOR_ZERO_INITIAL_GUESS) {
1664     PetscScalar *b;
1665     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1666     if (xx != bb) {
1667       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1668     } else {
1669       b = x;
1670     }
1671     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1672 
1673       for (i=0, row=0; i<m; i++) {
1674         sz  = diag[row] - ii[row];
1675         v1  = a->a + ii[row];
1676         idx = a->j + ii[row];
1677 
1678         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
1679         switch (sizes[i]){
1680           case 1:
1681 
1682             sum1  = b[row];
1683             for(n = 0; n<sz-1; n+=2) {
1684               i1   = idx[0];
1685               i2   = idx[1];
1686               idx += 2;
1687               tmp0 = x[i1];
1688               tmp1 = x[i2];
1689               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1690             }
1691 
1692             if (n == sz-1){
1693               tmp0  = x[*idx];
1694               sum1 -= *v1 * tmp0;
1695             }
1696             x[row++] = sum1*(*ibdiag++);
1697             break;
1698           case 2:
1699             v2    = a->a + ii[row+1];
1700             sum1  = b[row];
1701             sum2  = b[row+1];
1702             for(n = 0; n<sz-1; n+=2) {
1703               i1   = idx[0];
1704               i2   = idx[1];
1705               idx += 2;
1706               tmp0 = x[i1];
1707               tmp1 = x[i2];
1708               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1709               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1710             }
1711 
1712             if (n == sz-1){
1713               tmp0  = x[*idx];
1714               sum1 -= v1[0] * tmp0;
1715               sum2 -= v2[0] * tmp0;
1716             }
1717             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1718             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1719             ibdiag  += 4;
1720             break;
1721           case 3:
1722             v2    = a->a + ii[row+1];
1723             v3    = a->a + ii[row+2];
1724             sum1  = b[row];
1725             sum2  = b[row+1];
1726             sum3  = b[row+2];
1727             for(n = 0; n<sz-1; n+=2) {
1728               i1   = idx[0];
1729               i2   = idx[1];
1730               idx += 2;
1731               tmp0 = x[i1];
1732               tmp1 = x[i2];
1733               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1734               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1735               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1736             }
1737 
1738             if (n == sz-1){
1739               tmp0  = x[*idx];
1740               sum1 -= v1[0] * tmp0;
1741               sum2 -= v2[0] * tmp0;
1742               sum3 -= v3[0] * tmp0;
1743             }
1744             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1745             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1746             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1747             ibdiag  += 9;
1748             break;
1749           case 4:
1750             v2    = a->a + ii[row+1];
1751             v3    = a->a + ii[row+2];
1752             v4    = a->a + ii[row+3];
1753             sum1  = b[row];
1754             sum2  = b[row+1];
1755             sum3  = b[row+2];
1756             sum4  = b[row+3];
1757             for(n = 0; n<sz-1; n+=2) {
1758               i1   = idx[0];
1759               i2   = idx[1];
1760               idx += 2;
1761               tmp0 = x[i1];
1762               tmp1 = x[i2];
1763               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1764               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1765               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1766               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1767             }
1768 
1769             if (n == sz-1){
1770               tmp0  = x[*idx];
1771               sum1 -= v1[0] * tmp0;
1772               sum2 -= v2[0] * tmp0;
1773               sum3 -= v3[0] * tmp0;
1774               sum4 -= v4[0] * tmp0;
1775             }
1776             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1777             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1778             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1779             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1780             ibdiag  += 16;
1781             break;
1782           case 5:
1783             v2    = a->a + ii[row+1];
1784             v3    = a->a + ii[row+2];
1785             v4    = a->a + ii[row+3];
1786             v5    = a->a + ii[row+4];
1787             sum1  = b[row];
1788             sum2  = b[row+1];
1789             sum3  = b[row+2];
1790             sum4  = b[row+3];
1791             sum5  = b[row+4];
1792             for(n = 0; n<sz-1; n+=2) {
1793               i1   = idx[0];
1794               i2   = idx[1];
1795               idx += 2;
1796               tmp0 = x[i1];
1797               tmp1 = x[i2];
1798               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1799               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1800               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1801               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1802               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1803             }
1804 
1805             if (n == sz-1){
1806               tmp0  = x[*idx];
1807               sum1 -= v1[0] * tmp0;
1808               sum2 -= v2[0] * tmp0;
1809               sum3 -= v3[0] * tmp0;
1810               sum4 -= v4[0] * tmp0;
1811               sum5 -= v5[0] * tmp0;
1812             }
1813             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1814             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1815             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1816             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1817             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1818             ibdiag  += 25;
1819             break;
1820 	  default:
1821    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1822         }
1823       }
1824 
1825       xb = x;
1826       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1827     } else xb = b;
1828     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1829         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1830       cnt = 0;
1831       for (i=0, row=0; i<m; i++) {
1832 
1833         switch (sizes[i]){
1834           case 1:
1835             x[row++] *= bdiag[cnt++];
1836             break;
1837           case 2:
1838             x1   = x[row]; x2 = x[row+1];
1839             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1840             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1841             x[row++] = tmp1;
1842             x[row++] = tmp2;
1843             cnt += 4;
1844             break;
1845           case 3:
1846             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1847             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1848             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1849             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1850             x[row++] = tmp1;
1851             x[row++] = tmp2;
1852             x[row++] = tmp3;
1853             cnt += 9;
1854             break;
1855           case 4:
1856             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1857             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1858             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1859             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1860             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1861             x[row++] = tmp1;
1862             x[row++] = tmp2;
1863             x[row++] = tmp3;
1864             x[row++] = tmp4;
1865             cnt += 16;
1866             break;
1867           case 5:
1868             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1869             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1870             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1871             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1872             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1873             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1874             x[row++] = tmp1;
1875             x[row++] = tmp2;
1876             x[row++] = tmp3;
1877             x[row++] = tmp4;
1878             x[row++] = tmp5;
1879             cnt += 25;
1880             break;
1881 	  default:
1882    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1883         }
1884       }
1885       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1886     }
1887     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1888 
1889       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1890       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
1891         ibdiag -= sizes[i]*sizes[i];
1892         sz      = ii[row+1] - diag[row] - 1;
1893         v1      = a->a + diag[row] + 1;
1894         idx     = a->j + diag[row] + 1;
1895 
1896         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
1897         switch (sizes[i]){
1898           case 1:
1899 
1900             sum1  = xb[row];
1901             for(n = 0; n<sz-1; n+=2) {
1902               i1   = idx[0];
1903               i2   = idx[1];
1904               idx += 2;
1905               tmp0 = x[i1];
1906               tmp1 = x[i2];
1907               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1908             }
1909 
1910             if (n == sz-1){
1911               tmp0  = x[*idx];
1912               sum1 -= *v1*tmp0;
1913             }
1914             x[row--] = sum1*(*ibdiag);
1915             break;
1916 
1917           case 2:
1918 
1919             sum1  = xb[row];
1920             sum2  = xb[row-1];
1921             /* note that sum1 is associated with the second of the two rows */
1922             v2    = a->a + diag[row-1] + 2;
1923             for(n = 0; n<sz-1; n+=2) {
1924               i1   = idx[0];
1925               i2   = idx[1];
1926               idx += 2;
1927               tmp0 = x[i1];
1928               tmp1 = x[i2];
1929               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1930               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1931             }
1932 
1933             if (n == sz-1){
1934               tmp0  = x[*idx];
1935               sum1 -= *v1*tmp0;
1936               sum2 -= *v2*tmp0;
1937             }
1938             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1939             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1940             break;
1941           case 3:
1942 
1943             sum1  = xb[row];
1944             sum2  = xb[row-1];
1945             sum3  = xb[row-2];
1946             v2    = a->a + diag[row-1] + 2;
1947             v3    = a->a + diag[row-2] + 3;
1948             for(n = 0; n<sz-1; n+=2) {
1949               i1   = idx[0];
1950               i2   = idx[1];
1951               idx += 2;
1952               tmp0 = x[i1];
1953               tmp1 = x[i2];
1954               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1955               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1956               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1957             }
1958 
1959             if (n == sz-1){
1960               tmp0  = x[*idx];
1961               sum1 -= *v1*tmp0;
1962               sum2 -= *v2*tmp0;
1963               sum3 -= *v3*tmp0;
1964             }
1965             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
1966             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
1967             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
1968             break;
1969           case 4:
1970 
1971             sum1  = xb[row];
1972             sum2  = xb[row-1];
1973             sum3  = xb[row-2];
1974             sum4  = xb[row-3];
1975             v2    = a->a + diag[row-1] + 2;
1976             v3    = a->a + diag[row-2] + 3;
1977             v4    = a->a + diag[row-3] + 4;
1978             for(n = 0; n<sz-1; n+=2) {
1979               i1   = idx[0];
1980               i2   = idx[1];
1981               idx += 2;
1982               tmp0 = x[i1];
1983               tmp1 = x[i2];
1984               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1985               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1986               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1987               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1988             }
1989 
1990             if (n == sz-1){
1991               tmp0  = x[*idx];
1992               sum1 -= *v1*tmp0;
1993               sum2 -= *v2*tmp0;
1994               sum3 -= *v3*tmp0;
1995               sum4 -= *v4*tmp0;
1996             }
1997             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
1998             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
1999             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2000             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2001             break;
2002           case 5:
2003 
2004             sum1  = xb[row];
2005             sum2  = xb[row-1];
2006             sum3  = xb[row-2];
2007             sum4  = xb[row-3];
2008             sum5  = xb[row-4];
2009             v2    = a->a + diag[row-1] + 2;
2010             v3    = a->a + diag[row-2] + 3;
2011             v4    = a->a + diag[row-3] + 4;
2012             v5    = a->a + diag[row-4] + 5;
2013             for(n = 0; n<sz-1; n+=2) {
2014               i1   = idx[0];
2015               i2   = idx[1];
2016               idx += 2;
2017               tmp0 = x[i1];
2018               tmp1 = x[i2];
2019               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2020               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2021               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2022               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2023               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2024             }
2025 
2026             if (n == sz-1){
2027               tmp0  = x[*idx];
2028               sum1 -= *v1*tmp0;
2029               sum2 -= *v2*tmp0;
2030               sum3 -= *v3*tmp0;
2031               sum4 -= *v4*tmp0;
2032               sum5 -= *v5*tmp0;
2033             }
2034             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2035             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2036             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2037             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2038             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2039             break;
2040 	  default:
2041    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2042         }
2043       }
2044 
2045       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2046     }
2047     its--;
2048     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2049     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2050   }
2051   if (flag & SOR_EISENSTAT) {
2052     const PetscScalar *b;
2053     MatScalar         *t = a->inode.ssor_work;
2054 
2055     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2056     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2057     /*
2058           Apply  (U + D)^-1  where D is now the block diagonal
2059     */
2060     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2061     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2062       ibdiag -= sizes[i]*sizes[i];
2063       sz      = ii[row+1] - diag[row] - 1;
2064       v1      = a->a + diag[row] + 1;
2065       idx     = a->j + diag[row] + 1;
2066       CHKMEMQ;
2067       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2068       switch (sizes[i]){
2069         case 1:
2070 
2071 	  sum1  = b[row];
2072 	  for(n = 0; n<sz-1; n+=2) {
2073 	    i1   = idx[0];
2074 	    i2   = idx[1];
2075 	    idx += 2;
2076 	    tmp0 = x[i1];
2077 	    tmp1 = x[i2];
2078 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2079 	  }
2080 
2081 	  if (n == sz-1){
2082 	    tmp0  = x[*idx];
2083 	    sum1 -= *v1*tmp0;
2084 	  }
2085 	  x[row] = sum1*(*ibdiag);row--;
2086 	  break;
2087 
2088         case 2:
2089 
2090 	  sum1  = b[row];
2091 	  sum2  = b[row-1];
2092 	  /* note that sum1 is associated with the second of the two rows */
2093 	  v2    = a->a + diag[row-1] + 2;
2094 	  for(n = 0; n<sz-1; n+=2) {
2095 	    i1   = idx[0];
2096 	    i2   = idx[1];
2097 	    idx += 2;
2098 	    tmp0 = x[i1];
2099 	    tmp1 = x[i2];
2100 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2101 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2102 	  }
2103 
2104 	  if (n == sz-1){
2105 	    tmp0  = x[*idx];
2106 	    sum1 -= *v1*tmp0;
2107 	    sum2 -= *v2*tmp0;
2108 	  }
2109 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
2110 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
2111           row -= 2;
2112 	  break;
2113         case 3:
2114 
2115 	  sum1  = b[row];
2116 	  sum2  = b[row-1];
2117 	  sum3  = b[row-2];
2118 	  v2    = a->a + diag[row-1] + 2;
2119 	  v3    = a->a + diag[row-2] + 3;
2120 	  for(n = 0; n<sz-1; n+=2) {
2121 	    i1   = idx[0];
2122 	    i2   = idx[1];
2123 	    idx += 2;
2124 	    tmp0 = x[i1];
2125 	    tmp1 = x[i2];
2126 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2127 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2128 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2129 	  }
2130 
2131 	  if (n == sz-1){
2132 	    tmp0  = x[*idx];
2133 	    sum1 -= *v1*tmp0;
2134 	    sum2 -= *v2*tmp0;
2135 	    sum3 -= *v3*tmp0;
2136 	  }
2137 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2138 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2139 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2140           row -= 3;
2141 	  break;
2142         case 4:
2143 
2144 	  sum1  = b[row];
2145 	  sum2  = b[row-1];
2146 	  sum3  = b[row-2];
2147 	  sum4  = b[row-3];
2148 	  v2    = a->a + diag[row-1] + 2;
2149 	  v3    = a->a + diag[row-2] + 3;
2150 	  v4    = a->a + diag[row-3] + 4;
2151 	  for(n = 0; n<sz-1; n+=2) {
2152 	    i1   = idx[0];
2153 	    i2   = idx[1];
2154 	    idx += 2;
2155 	    tmp0 = x[i1];
2156 	    tmp1 = x[i2];
2157 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2158 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2159 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2160 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2161 	  }
2162 
2163 	  if (n == sz-1){
2164 	    tmp0  = x[*idx];
2165 	    sum1 -= *v1*tmp0;
2166 	    sum2 -= *v2*tmp0;
2167 	    sum3 -= *v3*tmp0;
2168 	    sum4 -= *v4*tmp0;
2169 	  }
2170 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2171 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2172 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2173 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2174           row -= 4;
2175 	  break;
2176         case 5:
2177 
2178 	  sum1  = b[row];
2179 	  sum2  = b[row-1];
2180 	  sum3  = b[row-2];
2181 	  sum4  = b[row-3];
2182 	  sum5  = b[row-4];
2183 	  v2    = a->a + diag[row-1] + 2;
2184 	  v3    = a->a + diag[row-2] + 3;
2185 	  v4    = a->a + diag[row-3] + 4;
2186 	  v5    = a->a + diag[row-4] + 5;
2187 	  for(n = 0; n<sz-1; n+=2) {
2188 	    i1   = idx[0];
2189 	    i2   = idx[1];
2190 	    idx += 2;
2191 	    tmp0 = x[i1];
2192 	    tmp1 = x[i2];
2193 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2194 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2195 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2196 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2197 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2198 	  }
2199 
2200 	  if (n == sz-1){
2201 	    tmp0  = x[*idx];
2202 	    sum1 -= *v1*tmp0;
2203 	    sum2 -= *v2*tmp0;
2204 	    sum3 -= *v3*tmp0;
2205 	    sum4 -= *v4*tmp0;
2206 	    sum5 -= *v5*tmp0;
2207 	  }
2208 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2209 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2210 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2211 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2212 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2213           row -= 5;
2214 	  break;
2215         default:
2216 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2217       }
2218       CHKMEMQ;
2219     }
2220     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2221 
2222     /*
2223            t = b - D x    where D is the block diagonal
2224     */
2225     cnt = 0;
2226     for (i=0, row=0; i<m; i++) {
2227       CHKMEMQ;
2228       switch (sizes[i]){
2229         case 1:
2230 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
2231 	  break;
2232         case 2:
2233 	  x1   = x[row]; x2 = x[row+1];
2234 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2235 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2236 	  t[row]   = b[row] - tmp1;
2237 	  t[row+1] = b[row+1] - tmp2; row += 2;
2238 	  cnt += 4;
2239 	  break;
2240         case 3:
2241 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2242 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2243 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2244 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2245 	  t[row] = b[row] - tmp1;
2246 	  t[row+1] = b[row+1] - tmp2;
2247 	  t[row+2] = b[row+2] - tmp3; row += 3;
2248 	  cnt += 9;
2249 	  break;
2250         case 4:
2251 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2252 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2253 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2254 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2255 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2256 	  t[row] = b[row] - tmp1;
2257 	  t[row+1] = b[row+1] - tmp2;
2258 	  t[row+2] = b[row+2] - tmp3;
2259 	  t[row+3] = b[row+3] - tmp4; row += 4;
2260 	  cnt += 16;
2261 	  break;
2262         case 5:
2263 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2264 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2265 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2266 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2267 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2268 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2269 	  t[row] = b[row] - tmp1;
2270 	  t[row+1] = b[row+1] - tmp2;
2271 	  t[row+2] = b[row+2] - tmp3;
2272 	  t[row+3] = b[row+3] - tmp4;
2273 	  t[row+4] = b[row+4] - tmp5;row += 5;
2274 	  cnt += 25;
2275 	  break;
2276         default:
2277 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2278       }
2279       CHKMEMQ;
2280     }
2281     ierr = PetscLogFlops(m);CHKERRQ(ierr);
2282 
2283 
2284 
2285     /*
2286           Apply (L + D)^-1 where D is the block diagonal
2287     */
2288     for (i=0, row=0; i<m; i++) {
2289       sz  = diag[row] - ii[row];
2290       v1  = a->a + ii[row];
2291       idx = a->j + ii[row];
2292       CHKMEMQ;
2293       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2294       switch (sizes[i]){
2295         case 1:
2296 
2297 	  sum1  = t[row];
2298 	  for(n = 0; n<sz-1; n+=2) {
2299 	    i1   = idx[0];
2300 	    i2   = idx[1];
2301 	    idx += 2;
2302 	    tmp0 = t[i1];
2303 	    tmp1 = t[i2];
2304 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2305 	  }
2306 
2307 	  if (n == sz-1){
2308 	    tmp0  = t[*idx];
2309 	    sum1 -= *v1 * tmp0;
2310 	  }
2311 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
2312 	  break;
2313         case 2:
2314 	  v2    = a->a + ii[row+1];
2315 	  sum1  = t[row];
2316 	  sum2  = t[row+1];
2317 	  for(n = 0; n<sz-1; n+=2) {
2318 	    i1   = idx[0];
2319 	    i2   = idx[1];
2320 	    idx += 2;
2321 	    tmp0 = t[i1];
2322 	    tmp1 = t[i2];
2323 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2324 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2325 	  }
2326 
2327 	  if (n == sz-1){
2328 	    tmp0  = t[*idx];
2329 	    sum1 -= v1[0] * tmp0;
2330 	    sum2 -= v2[0] * tmp0;
2331 	  }
2332 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
2333 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
2334 	  ibdiag  += 4; row += 2;
2335 	  break;
2336         case 3:
2337 	  v2    = a->a + ii[row+1];
2338 	  v3    = a->a + ii[row+2];
2339 	  sum1  = t[row];
2340 	  sum2  = t[row+1];
2341 	  sum3  = t[row+2];
2342 	  for(n = 0; n<sz-1; n+=2) {
2343 	    i1   = idx[0];
2344 	    i2   = idx[1];
2345 	    idx += 2;
2346 	    tmp0 = t[i1];
2347 	    tmp1 = t[i2];
2348 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2349 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2350 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2351 	  }
2352 
2353 	  if (n == sz-1){
2354 	    tmp0  = t[*idx];
2355 	    sum1 -= v1[0] * tmp0;
2356 	    sum2 -= v2[0] * tmp0;
2357 	    sum3 -= v3[0] * tmp0;
2358 	  }
2359 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2360 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2361 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2362 	  ibdiag  += 9; row += 3;
2363 	  break;
2364         case 4:
2365 	  v2    = a->a + ii[row+1];
2366 	  v3    = a->a + ii[row+2];
2367 	  v4    = a->a + ii[row+3];
2368 	  sum1  = t[row];
2369 	  sum2  = t[row+1];
2370 	  sum3  = t[row+2];
2371 	  sum4  = t[row+3];
2372 	  for(n = 0; n<sz-1; n+=2) {
2373 	    i1   = idx[0];
2374 	    i2   = idx[1];
2375 	    idx += 2;
2376 	    tmp0 = t[i1];
2377 	    tmp1 = t[i2];
2378 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2379 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2380 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2381 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2382 	  }
2383 
2384 	  if (n == sz-1){
2385 	    tmp0  = t[*idx];
2386 	    sum1 -= v1[0] * tmp0;
2387 	    sum2 -= v2[0] * tmp0;
2388 	    sum3 -= v3[0] * tmp0;
2389 	    sum4 -= v4[0] * tmp0;
2390 	  }
2391 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2392 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2393 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2394 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2395 	  ibdiag  += 16; row += 4;
2396 	  break;
2397         case 5:
2398 	  v2    = a->a + ii[row+1];
2399 	  v3    = a->a + ii[row+2];
2400 	  v4    = a->a + ii[row+3];
2401 	  v5    = a->a + ii[row+4];
2402 	  sum1  = t[row];
2403 	  sum2  = t[row+1];
2404 	  sum3  = t[row+2];
2405 	  sum4  = t[row+3];
2406 	  sum5  = t[row+4];
2407 	  for(n = 0; n<sz-1; n+=2) {
2408 	    i1   = idx[0];
2409 	    i2   = idx[1];
2410 	    idx += 2;
2411 	    tmp0 = t[i1];
2412 	    tmp1 = t[i2];
2413 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2414 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2415 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2416 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2417 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2418 	  }
2419 
2420 	  if (n == sz-1){
2421 	    tmp0  = t[*idx];
2422 	    sum1 -= v1[0] * tmp0;
2423 	    sum2 -= v2[0] * tmp0;
2424 	    sum3 -= v3[0] * tmp0;
2425 	    sum4 -= v4[0] * tmp0;
2426 	    sum5 -= v5[0] * tmp0;
2427 	  }
2428 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2429 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2430 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2431 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2432 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2433 	  ibdiag  += 25; row += 5;
2434 	  break;
2435         default:
2436 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2437       }
2438       CHKMEMQ;
2439     }
2440     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2441     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2442     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2443   }
2444   PetscFunctionReturn(0);
2445 }
2446 
2447 #undef __FUNCT__
2448 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
2449 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2450 {
2451   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2452   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
2453   const MatScalar    *bdiag = a->inode.bdiag;
2454   const PetscScalar  *b;
2455   PetscErrorCode      ierr;
2456   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
2457   const PetscInt      *sizes = a->inode.size;
2458 
2459   PetscFunctionBegin;
2460   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2461   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2462   cnt = 0;
2463   for (i=0, row=0; i<m; i++) {
2464     switch (sizes[i]){
2465       case 1:
2466 	x[row] = b[row]*bdiag[cnt++];row++;
2467 	break;
2468       case 2:
2469 	x1   = b[row]; x2 = b[row+1];
2470 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2471 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2472 	x[row++] = tmp1;
2473 	x[row++] = tmp2;
2474 	cnt += 4;
2475 	break;
2476       case 3:
2477 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
2478 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2479 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2480 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2481 	x[row++] = tmp1;
2482 	x[row++] = tmp2;
2483 	x[row++] = tmp3;
2484 	cnt += 9;
2485 	break;
2486       case 4:
2487 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
2488 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2489 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2490 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2491 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2492 	x[row++] = tmp1;
2493 	x[row++] = tmp2;
2494 	x[row++] = tmp3;
2495 	x[row++] = tmp4;
2496 	cnt += 16;
2497 	break;
2498       case 5:
2499 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
2500 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2501 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2502 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2503 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2504 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2505 	x[row++] = tmp1;
2506 	x[row++] = tmp2;
2507 	x[row++] = tmp3;
2508 	x[row++] = tmp4;
2509 	x[row++] = tmp5;
2510 	cnt += 25;
2511 	break;
2512       default:
2513 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2514     }
2515   }
2516   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
2517   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2518   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2519   PetscFunctionReturn(0);
2520 }
2521 
2522 /*
2523     samestructure indicates that the matrix has not changed its nonzero structure so we
2524     do not need to recompute the inodes
2525 */
2526 #undef __FUNCT__
2527 #define __FUNCT__ "Mat_CheckInode"
2528 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2529 {
2530   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2531   PetscErrorCode ierr;
2532   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2533   PetscTruth     flag;
2534 
2535   PetscFunctionBegin;
2536   if (!a->inode.use)                     PetscFunctionReturn(0);
2537   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2538 
2539 
2540   m = A->rmap->n;
2541   if (a->inode.size) {ns = a->inode.size;}
2542   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2543 
2544   i          = 0;
2545   node_count = 0;
2546   idx        = a->j;
2547   ii         = a->i;
2548   while (i < m){                /* For each row */
2549     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2550     /* Limits the number of elements in a node to 'a->inode.limit' */
2551     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2552       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2553       if (nzy != nzx) break;
2554       idy  += nzx;             /* Same nonzero pattern */
2555       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2556       if (!flag) break;
2557     }
2558     ns[node_count++] = blk_size;
2559     idx += blk_size*nzx;
2560     i    = j;
2561   }
2562   /* If not enough inodes found,, do not use inode version of the routines */
2563   if (!a->inode.size && m && node_count > .9*m) {
2564     ierr = PetscFree(ns);CHKERRQ(ierr);
2565     a->inode.node_count     = 0;
2566     a->inode.size           = PETSC_NULL;
2567     a->inode.use            = PETSC_FALSE;
2568     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2569   } else {
2570     A->ops->mult              = MatMult_SeqAIJ_Inode;
2571     A->ops->sor               = MatSOR_SeqAIJ_Inode;
2572     A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
2573     A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
2574     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
2575     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
2576     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
2577     A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
2578     A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
2579     a->inode.node_count       = node_count;
2580     a->inode.size             = ns;
2581     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2582   }
2583   PetscFunctionReturn(0);
2584 }
2585 
2586 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
2587 PetscInt __k, *__vi; \
2588 __vi = aj + ai[row];				\
2589 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
2590 __vi = aj + adiag[row];				\
2591 cols[nzl] = __vi[0];\
2592 __vi = aj + adiag[row+1]+1;\
2593 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
2594 
2595 #undef __FUNCT__
2596 #define __FUNCT__ "Mat_CheckInode_FactorLU"
2597 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
2598 {
2599   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2600   PetscErrorCode ierr;
2601   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
2602   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
2603   PetscTruth     flag;
2604 
2605   PetscFunctionBegin;
2606   if (!a->inode.use)                     PetscFunctionReturn(0);
2607   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2608 
2609   m = A->rmap->n;
2610   if (a->inode.size) {ns = a->inode.size;}
2611   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2612 
2613   i          = 0;
2614   node_count = 0;
2615   while (i < m){                /* For each row */
2616     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
2617     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
2618     nzx  = nzl1 + nzu1 + 1;
2619     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
2620     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
2621 
2622     /* Limits the number of elements in a node to 'a->inode.limit' */
2623     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2624       nzl2    = ai[j+1] - ai[j];
2625       nzu2    = adiag[j] - adiag[j+1] - 1;
2626       nzy     = nzl2 + nzu2 + 1;
2627       if( nzy != nzx) break;
2628       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
2629       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
2630       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2631       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
2632       ierr = PetscFree(cols2);CHKERRQ(ierr);
2633     }
2634     ns[node_count++] = blk_size;
2635     ierr = PetscFree(cols1);CHKERRQ(ierr);
2636     i    = j;
2637   }
2638   /* If not enough inodes found,, do not use inode version of the routines */
2639   if (!a->inode.size && m && node_count > .9*m) {
2640     ierr = PetscFree(ns);CHKERRQ(ierr);
2641     a->inode.node_count     = 0;
2642     a->inode.size           = PETSC_NULL;
2643     a->inode.use            = PETSC_FALSE;
2644     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2645   } else {
2646     A->ops->mult              = 0;
2647     A->ops->sor               = 0;
2648     A->ops->multadd           = 0;
2649     A->ops->getrowij          = 0;
2650     A->ops->restorerowij      = 0;
2651     A->ops->getcolumnij       = 0;
2652     A->ops->restorecolumnij   = 0;
2653     A->ops->coloringpatch     = 0;
2654     A->ops->multdiagonalblock = 0;
2655     a->inode.node_count       = node_count;
2656     a->inode.size             = ns;
2657     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2658   }
2659   PetscFunctionReturn(0);
2660 }
2661 
2662 /*
2663      This is really ugly. if inodes are used this replaces the
2664   permutations with ones that correspond to rows/cols of the matrix
2665   rather then inode blocks
2666 */
2667 #undef __FUNCT__
2668 #define __FUNCT__ "MatInodeAdjustForInodes"
2669 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2670 {
2671   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2672 
2673   PetscFunctionBegin;
2674   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2675   if (f) {
2676     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2677   }
2678   PetscFunctionReturn(0);
2679 }
2680 
2681 EXTERN_C_BEGIN
2682 #undef __FUNCT__
2683 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
2684 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
2685 {
2686   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2687   PetscErrorCode ierr;
2688   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
2689   const PetscInt *ridx,*cidx;
2690   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2691   PetscInt       nslim_col,*ns_col;
2692   IS             ris = *rperm,cis = *cperm;
2693 
2694   PetscFunctionBegin;
2695   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2696   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2697 
2698   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2699   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2700   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
2701 
2702   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2703   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2704 
2705   /* Form the inode structure for the rows of permuted matric using inv perm*/
2706   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2707 
2708   /* Construct the permutations for rows*/
2709   for (i=0,row = 0; i<nslim_row; ++i){
2710     indx      = ridx[i];
2711     start_val = tns[indx];
2712     end_val   = tns[indx + 1];
2713     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2714   }
2715 
2716   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2717   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2718 
2719  /* Construct permutations for columns */
2720   for (i=0,col=0; i<nslim_col; ++i){
2721     indx      = cidx[i];
2722     start_val = tns[indx];
2723     end_val   = tns[indx + 1];
2724     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2725   }
2726 
2727   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2728   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
2729   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2730   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
2731 
2732   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2733   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2734 
2735   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2736   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
2737   ierr = ISDestroy(cis);CHKERRQ(ierr);
2738   ierr = ISDestroy(ris);CHKERRQ(ierr);
2739   ierr = PetscFree(tns);CHKERRQ(ierr);
2740   PetscFunctionReturn(0);
2741 }
2742 EXTERN_C_END
2743 
2744 #undef __FUNCT__
2745 #define __FUNCT__ "MatInodeGetInodeSizes"
2746 /*@C
2747    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2748 
2749    Collective on Mat
2750 
2751    Input Parameter:
2752 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2753 
2754    Output Parameter:
2755 +  node_count - no of inodes present in the matrix.
2756 .  sizes      - an array of size node_count,with sizes of each inode.
2757 -  limit      - the max size used to generate the inodes.
2758 
2759    Level: advanced
2760 
2761    Notes: This routine returns some internal storage information
2762    of the matrix, it is intended to be used by advanced users.
2763    It should be called after the matrix is assembled.
2764    The contents of the sizes[] array should not be changed.
2765    PETSC_NULL may be passed for information not requested.
2766 
2767 .keywords: matrix, seqaij, get, inode
2768 
2769 .seealso: MatGetInfo()
2770 @*/
2771 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2772 {
2773   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2774 
2775   PetscFunctionBegin;
2776   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2777   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2778   if (f) {
2779     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2780   }
2781   PetscFunctionReturn(0);
2782 }
2783 
2784 EXTERN_C_BEGIN
2785 #undef __FUNCT__
2786 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
2787 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2788 {
2789   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2790 
2791   PetscFunctionBegin;
2792   if (node_count) *node_count = a->inode.node_count;
2793   if (sizes)      *sizes      = a->inode.size;
2794   if (limit)      *limit      = a->inode.limit;
2795   PetscFunctionReturn(0);
2796 }
2797 EXTERN_C_END
2798