xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 982c0cc0f4fcb440457cbf39109759124c711b27)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_Inode"
397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     sz   = n;                   /* No of non zeros in this row */
427                                 /* Switch on the size of Node */
428     switch (nsz){               /* Each loop in 'case' is unrolled */
429     case 1 :
430       sum1  = 0;
431 
432       for(n = 0; n< sz-1; n+=2) {
433         i1   = idx[0];          /* The instructions are ordered to */
434         i2   = idx[1];          /* make the compiler's job easy */
435         idx += 2;
436         tmp0 = x[i1];
437         tmp1 = x[i2];
438         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
439        }
440 
441       if (n == sz-1){          /* Take care of the last nonzero  */
442         tmp0  = x[*idx++];
443         sum1 += *v1++ * tmp0;
444       }
445       y[row++]=sum1;
446       break;
447     case 2:
448       sum1  = 0;
449       sum2  = 0;
450       v2    = v1 + n;
451 
452       for (n = 0; n< sz-1; n+=2) {
453         i1   = idx[0];
454         i2   = idx[1];
455         idx += 2;
456         tmp0 = x[i1];
457         tmp1 = x[i2];
458         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
459         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
460       }
461       if (n == sz-1){
462         tmp0  = x[*idx++];
463         sum1 += *v1++ * tmp0;
464         sum2 += *v2++ * tmp0;
465       }
466       y[row++]=sum1;
467       y[row++]=sum2;
468       v1      =v2;              /* Since the next block to be processed starts there*/
469       idx    +=sz;
470       break;
471     case 3:
472       sum1  = 0;
473       sum2  = 0;
474       sum3  = 0;
475       v2    = v1 + n;
476       v3    = v2 + n;
477 
478       for (n = 0; n< sz-1; n+=2) {
479         i1   = idx[0];
480         i2   = idx[1];
481         idx += 2;
482         tmp0 = x[i1];
483         tmp1 = x[i2];
484         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
485         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
486         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
487       }
488       if (n == sz-1){
489         tmp0  = x[*idx++];
490         sum1 += *v1++ * tmp0;
491         sum2 += *v2++ * tmp0;
492         sum3 += *v3++ * tmp0;
493       }
494       y[row++]=sum1;
495       y[row++]=sum2;
496       y[row++]=sum3;
497       v1       =v3;             /* Since the next block to be processed starts there*/
498       idx     +=2*sz;
499       break;
500     case 4:
501       sum1  = 0;
502       sum2  = 0;
503       sum3  = 0;
504       sum4  = 0;
505       v2    = v1 + n;
506       v3    = v2 + n;
507       v4    = v3 + n;
508 
509       for (n = 0; n< sz-1; n+=2) {
510         i1   = idx[0];
511         i2   = idx[1];
512         idx += 2;
513         tmp0 = x[i1];
514         tmp1 = x[i2];
515         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519       }
520       if (n == sz-1){
521         tmp0  = x[*idx++];
522         sum1 += *v1++ * tmp0;
523         sum2 += *v2++ * tmp0;
524         sum3 += *v3++ * tmp0;
525         sum4 += *v4++ * tmp0;
526       }
527       y[row++]=sum1;
528       y[row++]=sum2;
529       y[row++]=sum3;
530       y[row++]=sum4;
531       v1      =v4;              /* Since the next block to be processed starts there*/
532       idx    +=3*sz;
533       break;
534     case 5:
535       sum1  = 0;
536       sum2  = 0;
537       sum3  = 0;
538       sum4  = 0;
539       sum5  = 0;
540       v2    = v1 + n;
541       v3    = v2 + n;
542       v4    = v3 + n;
543       v5    = v4 + n;
544 
545       for (n = 0; n<sz-1; n+=2) {
546         i1   = idx[0];
547         i2   = idx[1];
548         idx += 2;
549         tmp0 = x[i1];
550         tmp1 = x[i2];
551         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
552         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
553         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
554         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
555         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
556       }
557       if (n == sz-1){
558         tmp0  = x[*idx++];
559         sum1 += *v1++ * tmp0;
560         sum2 += *v2++ * tmp0;
561         sum3 += *v3++ * tmp0;
562         sum4 += *v4++ * tmp0;
563         sum5 += *v5++ * tmp0;
564       }
565       y[row++]=sum1;
566       y[row++]=sum2;
567       y[row++]=sum3;
568       y[row++]=sum4;
569       y[row++]=sum5;
570       v1      =v5;       /* Since the next block to be processed starts there */
571       idx    +=4*sz;
572       break;
573     default :
574       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
575     }
576   }
577   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
578   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
579   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
580   PetscFunctionReturn(0);
581 }
582 /* ----------------------------------------------------------- */
583 /* Almost same code as the MatMult_Inode() */
584 #undef __FUNCT__
585 #define __FUNCT__ "MatMultAdd_Inode"
586 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
587 {
588   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
589   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
590   MatScalar      *v1,*v2,*v3,*v4,*v5;
591   PetscScalar    *x,*y,*z,*zt;
592   PetscErrorCode ierr;
593   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
594 
595   PetscFunctionBegin;
596   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
597   node_max = a->inode.node_count;
598   ns       = a->inode.size;     /* Node Size array */
599   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
600   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
601   if (zz != yy) {
602     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
603   } else {
604     z = y;
605   }
606   zt = z;
607 
608   idx  = a->j;
609   v1   = a->a;
610   ii   = a->i;
611 
612   for (i = 0,row = 0; i< node_max; ++i){
613     nsz  = ns[i];
614     n    = ii[1] - ii[0];
615     ii  += nsz;
616     sz   = n;                   /* No of non zeros in this row */
617                                 /* Switch on the size of Node */
618     switch (nsz){               /* Each loop in 'case' is unrolled */
619     case 1 :
620       sum1  = *zt++;
621 
622       for(n = 0; n< sz-1; n+=2) {
623         i1   = idx[0];          /* The instructions are ordered to */
624         i2   = idx[1];          /* make the compiler's job easy */
625         idx += 2;
626         tmp0 = x[i1];
627         tmp1 = x[i2];
628         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
629        }
630 
631       if(n   == sz-1){          /* Take care of the last nonzero  */
632         tmp0  = x[*idx++];
633         sum1 += *v1++ * tmp0;
634       }
635       y[row++]=sum1;
636       break;
637     case 2:
638       sum1  = *zt++;
639       sum2  = *zt++;
640       v2    = v1 + n;
641 
642       for(n = 0; n< sz-1; n+=2) {
643         i1   = idx[0];
644         i2   = idx[1];
645         idx += 2;
646         tmp0 = x[i1];
647         tmp1 = x[i2];
648         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
649         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
650       }
651       if(n   == sz-1){
652         tmp0  = x[*idx++];
653         sum1 += *v1++ * tmp0;
654         sum2 += *v2++ * tmp0;
655       }
656       y[row++]=sum1;
657       y[row++]=sum2;
658       v1      =v2;              /* Since the next block to be processed starts there*/
659       idx    +=sz;
660       break;
661     case 3:
662       sum1  = *zt++;
663       sum2  = *zt++;
664       sum3  = *zt++;
665       v2    = v1 + n;
666       v3    = v2 + n;
667 
668       for (n = 0; n< sz-1; n+=2) {
669         i1   = idx[0];
670         i2   = idx[1];
671         idx += 2;
672         tmp0 = x[i1];
673         tmp1 = x[i2];
674         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
675         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
676         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
677       }
678       if (n == sz-1){
679         tmp0  = x[*idx++];
680         sum1 += *v1++ * tmp0;
681         sum2 += *v2++ * tmp0;
682         sum3 += *v3++ * tmp0;
683       }
684       y[row++]=sum1;
685       y[row++]=sum2;
686       y[row++]=sum3;
687       v1       =v3;             /* Since the next block to be processed starts there*/
688       idx     +=2*sz;
689       break;
690     case 4:
691       sum1  = *zt++;
692       sum2  = *zt++;
693       sum3  = *zt++;
694       sum4  = *zt++;
695       v2    = v1 + n;
696       v3    = v2 + n;
697       v4    = v3 + n;
698 
699       for (n = 0; n< sz-1; n+=2) {
700         i1   = idx[0];
701         i2   = idx[1];
702         idx += 2;
703         tmp0 = x[i1];
704         tmp1 = x[i2];
705         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
706         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
707         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
708         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
709       }
710       if (n == sz-1){
711         tmp0  = x[*idx++];
712         sum1 += *v1++ * tmp0;
713         sum2 += *v2++ * tmp0;
714         sum3 += *v3++ * tmp0;
715         sum4 += *v4++ * tmp0;
716       }
717       y[row++]=sum1;
718       y[row++]=sum2;
719       y[row++]=sum3;
720       y[row++]=sum4;
721       v1      =v4;              /* Since the next block to be processed starts there*/
722       idx    +=3*sz;
723       break;
724     case 5:
725       sum1  = *zt++;
726       sum2  = *zt++;
727       sum3  = *zt++;
728       sum4  = *zt++;
729       sum5  = *zt++;
730       v2    = v1 + n;
731       v3    = v2 + n;
732       v4    = v3 + n;
733       v5    = v4 + n;
734 
735       for (n = 0; n<sz-1; n+=2) {
736         i1   = idx[0];
737         i2   = idx[1];
738         idx += 2;
739         tmp0 = x[i1];
740         tmp1 = x[i2];
741         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
742         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
743         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
744         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
745         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
746       }
747       if(n   == sz-1){
748         tmp0  = x[*idx++];
749         sum1 += *v1++ * tmp0;
750         sum2 += *v2++ * tmp0;
751         sum3 += *v3++ * tmp0;
752         sum4 += *v4++ * tmp0;
753         sum5 += *v5++ * tmp0;
754       }
755       y[row++]=sum1;
756       y[row++]=sum2;
757       y[row++]=sum3;
758       y[row++]=sum4;
759       y[row++]=sum5;
760       v1      =v5;       /* Since the next block to be processed starts there */
761       idx    +=4*sz;
762       break;
763     default :
764       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
765     }
766   }
767   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
768   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
769   if (zz != yy) {
770     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
771   }
772   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
773   PetscFunctionReturn(0);
774 }
775 
776 /* ----------------------------------------------------------- */
777 #undef __FUNCT__
778 #define __FUNCT__ "MatSolve_Inode"
779 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
780 {
781   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
782   IS                iscol = a->col,isrow = a->row;
783   PetscErrorCode    ierr;
784   const PetscInt    *r,*c,*rout,*cout;
785   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
786   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
787   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
788   PetscScalar       sum1,sum2,sum3,sum4,sum5;
789   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790   const PetscScalar *b;
791 
792   PetscFunctionBegin;
793   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
794   node_max = a->inode.node_count;
795   ns       = a->inode.size;     /* Node Size array */
796 
797   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   tmp  = a->solve_work;
800 
801   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
802   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
803 
804   /* forward solve the lower triangular */
805   tmps = tmp ;
806   aa   = a_a ;
807   aj   = a_j ;
808   ad   = a->diag;
809 
810   for (i = 0,row = 0; i< node_max; ++i){
811     nsz = ns[i];
812     aii = ai[row];
813     v1  = aa + aii;
814     vi  = aj + aii;
815     nz  = ad[row]- aii;
816 
817     switch (nsz){               /* Each loop in 'case' is unrolled */
818     case 1 :
819       sum1 = b[*r++];
820       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
821       for(j=0; j<nz-1; j+=2){
822         i0   = vi[0];
823         i1   = vi[1];
824         vi  +=2;
825         tmp0 = tmps[i0];
826         tmp1 = tmps[i1];
827         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828       }
829       if(j == nz-1){
830         tmp0 = tmps[*vi++];
831         sum1 -= *v1++ *tmp0;
832       }
833       tmp[row ++]=sum1;
834       break;
835     case 2:
836       sum1 = b[*r++];
837       sum2 = b[*r++];
838       v2   = aa + ai[row+1];
839 
840       for(j=0; j<nz-1; j+=2){
841         i0   = vi[0];
842         i1   = vi[1];
843         vi  +=2;
844         tmp0 = tmps[i0];
845         tmp1 = tmps[i1];
846         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
847         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
848       }
849       if(j == nz-1){
850         tmp0 = tmps[*vi++];
851         sum1 -= *v1++ *tmp0;
852         sum2 -= *v2++ *tmp0;
853       }
854       sum2 -= *v2++ * sum1;
855       tmp[row ++]=sum1;
856       tmp[row ++]=sum2;
857       break;
858     case 3:
859       sum1 = b[*r++];
860       sum2 = b[*r++];
861       sum3 = b[*r++];
862       v2   = aa + ai[row+1];
863       v3   = aa + ai[row+2];
864 
865       for (j=0; j<nz-1; j+=2){
866         i0   = vi[0];
867         i1   = vi[1];
868         vi  +=2;
869         tmp0 = tmps[i0];
870         tmp1 = tmps[i1];
871         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
872         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
873         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
874       }
875       if (j == nz-1){
876         tmp0 = tmps[*vi++];
877         sum1 -= *v1++ *tmp0;
878         sum2 -= *v2++ *tmp0;
879         sum3 -= *v3++ *tmp0;
880       }
881       sum2 -= *v2++ * sum1;
882       sum3 -= *v3++ * sum1;
883       sum3 -= *v3++ * sum2;
884       tmp[row ++]=sum1;
885       tmp[row ++]=sum2;
886       tmp[row ++]=sum3;
887       break;
888 
889     case 4:
890       sum1 = b[*r++];
891       sum2 = b[*r++];
892       sum3 = b[*r++];
893       sum4 = b[*r++];
894       v2   = aa + ai[row+1];
895       v3   = aa + ai[row+2];
896       v4   = aa + ai[row+3];
897 
898       for (j=0; j<nz-1; j+=2){
899         i0   = vi[0];
900         i1   = vi[1];
901         vi  +=2;
902         tmp0 = tmps[i0];
903         tmp1 = tmps[i1];
904         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
905         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
906         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
907         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
908       }
909       if (j == nz-1){
910         tmp0 = tmps[*vi++];
911         sum1 -= *v1++ *tmp0;
912         sum2 -= *v2++ *tmp0;
913         sum3 -= *v3++ *tmp0;
914         sum4 -= *v4++ *tmp0;
915       }
916       sum2 -= *v2++ * sum1;
917       sum3 -= *v3++ * sum1;
918       sum4 -= *v4++ * sum1;
919       sum3 -= *v3++ * sum2;
920       sum4 -= *v4++ * sum2;
921       sum4 -= *v4++ * sum3;
922 
923       tmp[row ++]=sum1;
924       tmp[row ++]=sum2;
925       tmp[row ++]=sum3;
926       tmp[row ++]=sum4;
927       break;
928     case 5:
929       sum1 = b[*r++];
930       sum2 = b[*r++];
931       sum3 = b[*r++];
932       sum4 = b[*r++];
933       sum5 = b[*r++];
934       v2   = aa + ai[row+1];
935       v3   = aa + ai[row+2];
936       v4   = aa + ai[row+3];
937       v5   = aa + ai[row+4];
938 
939       for (j=0; j<nz-1; j+=2){
940         i0   = vi[0];
941         i1   = vi[1];
942         vi  +=2;
943         tmp0 = tmps[i0];
944         tmp1 = tmps[i1];
945         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
949         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
950       }
951       if (j == nz-1){
952         tmp0 = tmps[*vi++];
953         sum1 -= *v1++ *tmp0;
954         sum2 -= *v2++ *tmp0;
955         sum3 -= *v3++ *tmp0;
956         sum4 -= *v4++ *tmp0;
957         sum5 -= *v5++ *tmp0;
958       }
959 
960       sum2 -= *v2++ * sum1;
961       sum3 -= *v3++ * sum1;
962       sum4 -= *v4++ * sum1;
963       sum5 -= *v5++ * sum1;
964       sum3 -= *v3++ * sum2;
965       sum4 -= *v4++ * sum2;
966       sum5 -= *v5++ * sum2;
967       sum4 -= *v4++ * sum3;
968       sum5 -= *v5++ * sum3;
969       sum5 -= *v5++ * sum4;
970 
971       tmp[row ++]=sum1;
972       tmp[row ++]=sum2;
973       tmp[row ++]=sum3;
974       tmp[row ++]=sum4;
975       tmp[row ++]=sum5;
976       break;
977     default:
978       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
979     }
980   }
981   /* backward solve the upper triangular */
982   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
983     nsz = ns[i];
984     aii = ai[row+1] -1;
985     v1  = aa + aii;
986     vi  = aj + aii;
987     nz  = aii- ad[row];
988     switch (nsz){               /* Each loop in 'case' is unrolled */
989     case 1 :
990       sum1 = tmp[row];
991 
992       for(j=nz ; j>1; j-=2){
993         vi  -=2;
994         i0   = vi[2];
995         i1   = vi[1];
996         tmp0 = tmps[i0];
997         tmp1 = tmps[i1];
998         v1   -= 2;
999         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000       }
1001       if (j==1){
1002         tmp0  = tmps[*vi--];
1003         sum1 -= *v1-- * tmp0;
1004       }
1005       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006       break;
1007     case 2 :
1008       sum1 = tmp[row];
1009       sum2 = tmp[row -1];
1010       v2   = aa + ai[row]-1;
1011       for (j=nz ; j>1; j-=2){
1012         vi  -=2;
1013         i0   = vi[2];
1014         i1   = vi[1];
1015         tmp0 = tmps[i0];
1016         tmp1 = tmps[i1];
1017         v1   -= 2;
1018         v2   -= 2;
1019         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1020         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1021       }
1022       if (j==1){
1023         tmp0  = tmps[*vi--];
1024         sum1 -= *v1-- * tmp0;
1025         sum2 -= *v2-- * tmp0;
1026       }
1027 
1028       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1029       sum2   -= *v2-- * tmp0;
1030       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1031       break;
1032     case 3 :
1033       sum1 = tmp[row];
1034       sum2 = tmp[row -1];
1035       sum3 = tmp[row -2];
1036       v2   = aa + ai[row]-1;
1037       v3   = aa + ai[row -1]-1;
1038       for (j=nz ; j>1; j-=2){
1039         vi  -=2;
1040         i0   = vi[2];
1041         i1   = vi[1];
1042         tmp0 = tmps[i0];
1043         tmp1 = tmps[i1];
1044         v1   -= 2;
1045         v2   -= 2;
1046         v3   -= 2;
1047         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1048         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1049         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1050       }
1051       if (j==1){
1052         tmp0  = tmps[*vi--];
1053         sum1 -= *v1-- * tmp0;
1054         sum2 -= *v2-- * tmp0;
1055         sum3 -= *v3-- * tmp0;
1056       }
1057       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1058       sum2   -= *v2-- * tmp0;
1059       sum3   -= *v3-- * tmp0;
1060       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1061       sum3   -= *v3-- * tmp0;
1062       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1063 
1064       break;
1065     case 4 :
1066       sum1 = tmp[row];
1067       sum2 = tmp[row -1];
1068       sum3 = tmp[row -2];
1069       sum4 = tmp[row -3];
1070       v2   = aa + ai[row]-1;
1071       v3   = aa + ai[row -1]-1;
1072       v4   = aa + ai[row -2]-1;
1073 
1074       for (j=nz ; j>1; j-=2){
1075         vi  -=2;
1076         i0   = vi[2];
1077         i1   = vi[1];
1078         tmp0 = tmps[i0];
1079         tmp1 = tmps[i1];
1080         v1  -= 2;
1081         v2  -= 2;
1082         v3  -= 2;
1083         v4  -= 2;
1084         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1085         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1086         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1087         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1088       }
1089       if (j==1){
1090         tmp0  = tmps[*vi--];
1091         sum1 -= *v1-- * tmp0;
1092         sum2 -= *v2-- * tmp0;
1093         sum3 -= *v3-- * tmp0;
1094         sum4 -= *v4-- * tmp0;
1095       }
1096 
1097       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1098       sum2   -= *v2-- * tmp0;
1099       sum3   -= *v3-- * tmp0;
1100       sum4   -= *v4-- * tmp0;
1101       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102       sum3   -= *v3-- * tmp0;
1103       sum4   -= *v4-- * tmp0;
1104       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105       sum4   -= *v4-- * tmp0;
1106       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1107       break;
1108     case 5 :
1109       sum1 = tmp[row];
1110       sum2 = tmp[row -1];
1111       sum3 = tmp[row -2];
1112       sum4 = tmp[row -3];
1113       sum5 = tmp[row -4];
1114       v2   = aa + ai[row]-1;
1115       v3   = aa + ai[row -1]-1;
1116       v4   = aa + ai[row -2]-1;
1117       v5   = aa + ai[row -3]-1;
1118       for (j=nz ; j>1; j-=2){
1119         vi  -= 2;
1120         i0   = vi[2];
1121         i1   = vi[1];
1122         tmp0 = tmps[i0];
1123         tmp1 = tmps[i1];
1124         v1   -= 2;
1125         v2   -= 2;
1126         v3   -= 2;
1127         v4   -= 2;
1128         v5   -= 2;
1129         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1130         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1131         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1132         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1133         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1134       }
1135       if (j==1){
1136         tmp0  = tmps[*vi--];
1137         sum1 -= *v1-- * tmp0;
1138         sum2 -= *v2-- * tmp0;
1139         sum3 -= *v3-- * tmp0;
1140         sum4 -= *v4-- * tmp0;
1141         sum5 -= *v5-- * tmp0;
1142       }
1143 
1144       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1145       sum2   -= *v2-- * tmp0;
1146       sum3   -= *v3-- * tmp0;
1147       sum4   -= *v4-- * tmp0;
1148       sum5   -= *v5-- * tmp0;
1149       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1150       sum3   -= *v3-- * tmp0;
1151       sum4   -= *v4-- * tmp0;
1152       sum5   -= *v5-- * tmp0;
1153       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1154       sum4   -= *v4-- * tmp0;
1155       sum5   -= *v5-- * tmp0;
1156       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1157       sum5   -= *v5-- * tmp0;
1158       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1159       break;
1160     default:
1161       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1162     }
1163   }
1164   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1165   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1166   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1167   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1168   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1169   PetscFunctionReturn(0);
1170 }
1171 
1172 #undef __FUNCT__
1173 #define __FUNCT__ "MatLUFactorNumeric_Inode"
1174 /*
1175     Not currently used because the basic version is often a good amount faster
1176 */
1177 PetscErrorCode MatLUFactorNumeric_Inode(Mat B,Mat A,const MatFactorInfo *info)
1178 {
1179   Mat               C = B;
1180   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1181   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1182   PetscErrorCode    ierr;
1183   const PetscInt    *ns,*r,*ic,*c,*ics,*bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,*bi = b->i,*ai = a->i,*aj = a->j,*bd = b->diag,*pj;
1184   PetscInt          n = A->rmap->n;
1185   PetscInt          nz,nz_tmp,row,prow;
1186   PetscInt          i,j,idx,node_max,nodesz;
1187   PetscInt          *tmp_vec1,*tmp_vec2,*nsmap;
1188   PetscScalar       mul1,mul2,mul3,tmp;
1189   const MatScalar   *pv,*v1,*v2,*v3,*aa = a->a,*rtmp1;
1190   MatScalar         *rtmp11,*pc1,*pc2,*pc3,*ba = b->a,*rtmp22,*rtmp33;;
1191   PetscReal         rs=0.0;
1192   LUShift_Ctx       sctx;
1193   PetscInt          newshift;
1194 
1195   PetscFunctionBegin;
1196   sctx.shift_top      = 0;
1197   sctx.nshift_max     = 0;
1198   sctx.shift_lo       = 0;
1199   sctx.shift_hi       = 0;
1200   sctx.shift_fraction = 0;
1201 
1202   /* if both shift schemes are chosen by user, only use info->shiftpd */
1203   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1204     sctx.shift_top = 0;
1205     for (i=0; i<n; i++) {
1206       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1207       rs    = 0.0;
1208       ajtmp = aj + ai[i];
1209       rtmp1 = aa + ai[i];
1210       nz = ai[i+1] - ai[i];
1211       for (j=0; j<nz; j++){
1212         if (*ajtmp != i){
1213           rs += PetscAbsScalar(*rtmp1++);
1214         } else {
1215           rs -= PetscRealPart(*rtmp1++);
1216         }
1217         ajtmp++;
1218       }
1219       if (rs>sctx.shift_top) sctx.shift_top = rs;
1220     }
1221     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1222     sctx.shift_top *= 1.1;
1223     sctx.nshift_max = 5;
1224     sctx.shift_lo   = 0.;
1225     sctx.shift_hi   = 1.;
1226   }
1227   sctx.shift_amount = 0;
1228   sctx.nshift       = 0;
1229 
1230   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1231   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1232   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1233   ierr  = PetscMalloc3(n,PetscScalar,&rtmp11,n,PetscScalar,&rtmp22,n,PetscScalar,&rtmp33);CHKERRQ(ierr);
1234   ierr  = PetscMemzero(rtmp11,n*sizeof(PetscScalar));CHKERRQ(ierr);
1235   ierr  = PetscMemzero(rtmp22,n*sizeof(PetscScalar));CHKERRQ(ierr);
1236   ierr  = PetscMemzero(rtmp33,n*sizeof(PetscScalar));CHKERRQ(ierr);
1237   ics   = ic ;
1238 
1239   node_max = a->inode.node_count;
1240   ns       = a->inode.size;
1241   if (!ns){
1242     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1243   }
1244 
1245   /* If max inode size > 3, split it into two inodes.*/
1246   /* also map the inode sizes according to the ordering */
1247   ierr = PetscMalloc2(n,PetscInt,&tmp_vec1,n,PetscInt,&nsmap);CHKERRQ(ierr);
1248   for (i=0,j=0; i<node_max; ++i,++j){
1249     if (ns[i]>3) {
1250       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1251       ++j;
1252       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1253     } else {
1254       tmp_vec1[j] = ns[i];
1255     }
1256   }
1257   /* Use the correct node_max */
1258   node_max = j;
1259 
1260   /* Now reorder the inode info based on mat re-ordering info */
1261   /* First create a row -> inode_size_array_index map */
1262   ierr = PetscMalloc(node_max*sizeof(PetscInt),&tmp_vec2);CHKERRQ(ierr);
1263   for (i=0,row=0; i<node_max; i++) {
1264     nodesz = tmp_vec1[i];
1265     for (j=0; j<nodesz; j++,row++) {
1266       nsmap[row] = i;
1267     }
1268   }
1269   /* Using nsmap, create a reordered ns structure */
1270   for (i=0,j=0; i< node_max; i++) {
1271     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1272     tmp_vec2[i]  = nodesz;
1273     j           += nodesz;
1274   }
1275   ierr = PetscFree2(tmp_vec1,nsmap);CHKERRQ(ierr);
1276   /* Now use the correct ns */
1277   ns = tmp_vec2;
1278 
1279   do {
1280     sctx.lushift = PETSC_FALSE;
1281     /* Now loop over each block-row, and do the factorization */
1282     for (i=0,row=0; i<node_max; i++) {
1283       nodesz = ns[i];
1284       nz     = bi[row+1] - bi[row];
1285       bjtmp  = bj + bi[row];
1286 
1287       switch (nodesz){
1288       case 1:
1289         for  (j=0; j<nz; j++){
1290           rtmp11[bjtmp[j]] = 0.0;
1291         }
1292 
1293         /* load in initial (unfactored row) */
1294         idx    = r[row];
1295         nz_tmp = ai[idx+1] - ai[idx];
1296         ajtmp  = aj + ai[idx];
1297         v1     = aa + ai[idx];
1298 
1299         for (j=0; j<nz_tmp; j++) {
1300           idx        = ics[ajtmp[j]];
1301           rtmp11[idx] = v1[j];
1302         }
1303         rtmp11[ics[r[row]]] += sctx.shift_amount;
1304 
1305         prow = *bjtmp++ ;
1306         while (prow < row) {
1307           pc1 = rtmp11 + prow;
1308           if (*pc1 != 0.0){
1309             pv   = ba + bd[prow];
1310             pj   = nbj + bd[prow];
1311             mul1 = *pc1 * *pv++;
1312             *pc1 = mul1;
1313             nz_tmp = bi[prow+1] - bd[prow] - 1;
1314             ierr = PetscLogFlops(2.0*nz_tmp);CHKERRQ(ierr);
1315             for (j=0; j<nz_tmp; j++) {
1316               rtmp11[pj[i]] -= mul1 * pv[j];
1317             }
1318           }
1319           prow = *bjtmp++ ;
1320         }
1321         pj  = bj + bi[row];
1322         pc1 = ba + bi[row];
1323 
1324         sctx.pv    = rtmp11[row];
1325         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1326         rs         = 0.0;
1327         for (j=0; j<nz; j++) {
1328           idx    = pj[j];
1329           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1330           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1331         }
1332         sctx.rs  = rs;
1333         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1334         if (newshift == 1) goto endofwhile;
1335         break;
1336 
1337       case 2:
1338         for (j=0; j<nz; j++) {
1339           idx        = bjtmp[j];
1340           rtmp11[idx] = 0.0;
1341           rtmp22[idx] = 0.0;
1342         }
1343 
1344         /* load in initial (unfactored row) */
1345         idx    = r[row];
1346         nz_tmp = ai[idx+1] - ai[idx];
1347         ajtmp  = aj + ai[idx];
1348         v1     = aa + ai[idx];
1349         v2     = aa + ai[idx+1];
1350         for (j=0; j<nz_tmp; j++) {
1351           idx        = ics[ajtmp[j]];
1352           rtmp11[idx] = v1[j];
1353           rtmp22[idx] = v2[j];
1354         }
1355         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1356         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1357 
1358         prow = *bjtmp++ ;
1359         while (prow < row) {
1360           pc1 = rtmp11 + prow;
1361           pc2 = rtmp22 + prow;
1362           if (*pc1 != 0.0 || *pc2 != 0.0){
1363             pv   = ba + bd[prow];
1364             pj   = nbj + bd[prow];
1365             mul1 = *pc1 * *pv;
1366             mul2 = *pc2 * *pv;
1367             ++pv;
1368             *pc1 = mul1;
1369             *pc2 = mul2;
1370 
1371             nz_tmp = bi[prow+1] - bd[prow] - 1;
1372             for (j=0; j<nz_tmp; j++) {
1373               tmp = pv[j];
1374               idx = pj[j];
1375               rtmp11[idx] -= mul1 * tmp;
1376               rtmp22[idx] -= mul2 * tmp;
1377             }
1378             ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr);
1379           }
1380           prow = *bjtmp++ ;
1381         }
1382 
1383         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1384         pc1 = rtmp11 + prow;
1385         pc2 = rtmp22 + prow;
1386 
1387         sctx.pv = *pc1;
1388         pj      = bj + bi[prow];
1389         rs      = 0.0;
1390         for (j=0; j<nz; j++){
1391           idx = pj[j];
1392           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1393         }
1394         sctx.rs = rs;
1395         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1396         if (newshift == 1) goto endofwhile;
1397 
1398         if (*pc2 != 0.0){
1399           pj     = nbj + bd[prow];
1400           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1401           *pc2   = mul2;
1402           nz_tmp = bi[prow+1] - bd[prow] - 1;
1403           for (j=0; j<nz_tmp; j++) {
1404             idx = pj[j] ;
1405             tmp = rtmp11[idx];
1406             rtmp22[idx] -= mul2 * tmp;
1407           }
1408           ierr = PetscLogFlops(2.0*nz_tmp);CHKERRQ(ierr);
1409         }
1410 
1411         pj  = bj + bi[row];
1412         pc1 = ba + bi[row];
1413         pc2 = ba + bi[row+1];
1414 
1415         sctx.pv = rtmp22[row+1];
1416         rs = 0.0;
1417         rtmp11[row]   = 1.0/rtmp11[row];
1418         rtmp22[row+1] = 1.0/rtmp22[row+1];
1419         /* copy row entries from dense representation to sparse */
1420         for (j=0; j<nz; j++) {
1421           idx    = pj[j];
1422           pc1[j] = rtmp11[idx];
1423           pc2[j] = rtmp22[idx];
1424           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1425         }
1426         sctx.rs = rs;
1427         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1428         if (newshift == 1) goto endofwhile;
1429         break;
1430 
1431       case 3:
1432         for  (j=0; j<nz; j++) {
1433           idx        = bjtmp[j];
1434           rtmp11[idx] = 0.0;
1435           rtmp22[idx] = 0.0;
1436           rtmp33[idx] = 0.0;
1437         }
1438         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1439         idx    = r[row];
1440         nz_tmp = ai[idx+1] - ai[idx];
1441         ajtmp = aj + ai[idx];
1442         v1    = aa + ai[idx];
1443         v2    = aa + ai[idx+1];
1444         v3    = aa + ai[idx+2];
1445         for (j=0; j<nz_tmp; j++) {
1446           idx        = ics[ajtmp[j]];
1447           rtmp11[idx] = v1[j];
1448           rtmp22[idx] = v2[j];
1449           rtmp33[idx] = v3[j];
1450         }
1451         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1452         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1453         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1454 
1455         /* loop over all pivot row blocks above this row block */
1456         prow = *bjtmp++ ;
1457         while (prow < row) {
1458           pc1 = rtmp11 + prow;
1459           pc2 = rtmp22 + prow;
1460           pc3 = rtmp33 + prow;
1461           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1462             pv   = ba  + bd[prow];
1463             pj   = nbj + bd[prow];
1464             mul1 = *pc1 * *pv;
1465             mul2 = *pc2 * *pv;
1466             mul3 = *pc3 * *pv;
1467             ++pv;
1468             *pc1 = mul1;
1469             *pc2 = mul2;
1470             *pc3 = mul3;
1471 
1472             nz_tmp = bi[prow+1] - bd[prow] - 1;
1473             /* update this row based on pivot row */
1474             for (j=0; j<nz_tmp; j++) {
1475               tmp = pv[j];
1476               idx = pj[j];
1477               rtmp11[idx] -= mul1 * tmp;
1478               rtmp22[idx] -= mul2 * tmp;
1479               rtmp33[idx] -= mul3 * tmp;
1480             }
1481             ierr = PetscLogFlops(6.0*nz_tmp);CHKERRQ(ierr);
1482           }
1483           prow = *bjtmp++ ;
1484         }
1485 
1486         /* Now take care of diagonal 3x3 block in this set of rows */
1487         /* note: prow = row here */
1488         pc1 = rtmp11 + prow;
1489         pc2 = rtmp22 + prow;
1490         pc3 = rtmp33 + prow;
1491 
1492         sctx.pv = *pc1;
1493         pj      = bj + bi[prow];
1494         rs      = 0.0;
1495         for (j=0; j<nz; j++){
1496           idx = pj[j];
1497           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
1498         }
1499         sctx.rs = rs;
1500         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1501         if (newshift == 1) goto endofwhile;
1502 
1503         if (*pc2 != 0.0 || *pc3 != 0.0){
1504           mul2 = (*pc2)/(*pc1);
1505           mul3 = (*pc3)/(*pc1);
1506           *pc2 = mul2;
1507           *pc3 = mul3;
1508           nz_tmp = bi[prow+1] - bd[prow] - 1;
1509           pj     = nbj + bd[prow];
1510           for (j=0; j<nz_tmp; j++) {
1511             idx = pj[j] ;
1512             tmp = rtmp11[idx];
1513             rtmp22[idx] -= mul2 * tmp;
1514             rtmp33[idx] -= mul3 * tmp;
1515           }
1516           ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr);
1517         }
1518         ++prow;
1519 
1520         pc2 = rtmp22 + prow;
1521         pc3 = rtmp33 + prow;
1522         sctx.pv = *pc2;
1523         pj      = bj + bi[prow];
1524         rs      = 0.0;
1525         for (j=0; j<nz; j++){
1526           idx = pj[j];
1527           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
1528         }
1529         sctx.rs = rs;
1530         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1531         if (newshift == 1) goto endofwhile;
1532 
1533         if (*pc3 != 0.0){
1534           mul3   = (*pc3)/(*pc2);
1535           *pc3   = mul3;
1536           pj     = nbj + bd[prow];
1537           nz_tmp = bi[prow+1] - bd[prow] - 1;
1538           for (j=0; j<nz_tmp; j++) {
1539             idx = pj[j] ;
1540             tmp = rtmp22[idx];
1541             rtmp33[idx] -= mul3 * tmp;
1542           }
1543           ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr);
1544         }
1545 
1546         pj  = bj + bi[row];
1547         pc1 = ba + bi[row];
1548         pc2 = ba + bi[row+1];
1549         pc3 = ba + bi[row+2];
1550 
1551         sctx.pv = rtmp33[row+2];
1552         rs = 0.0;
1553         rtmp11[row]   = 1.0/rtmp11[row];
1554         rtmp22[row+1] = 1.0/rtmp22[row+1];
1555         rtmp33[row+2] = 1.0/rtmp33[row+2];
1556         /* copy row entries from dense representation to sparse */
1557         for (j=0; j<nz; j++) {
1558           idx    = pj[j];
1559           pc1[j] = rtmp11[idx];
1560           pc2[j] = rtmp22[idx];
1561           pc3[j] = rtmp33[idx];
1562           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1563         }
1564 
1565         sctx.rs = rs;
1566         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
1567         if (newshift == 1) goto endofwhile;
1568         break;
1569 
1570       default:
1571         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1572       }
1573       row += nodesz;                 /* Update the row */
1574     }
1575     endofwhile:;
1576   } while (sctx.lushift);
1577   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
1578   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1579   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1580   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1581   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1582   (B)->ops->solve           = MatSolve_Inode;
1583   /* do not set solve add, since MatSolve_Inode + Add is faster */
1584   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1585   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1586   C->assembled   = PETSC_TRUE;
1587   C->preallocated = PETSC_TRUE;
1588   if (sctx.nshift) {
1589     if (info->shiftpd) {
1590       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1591     } else if (info->shiftnz) {
1592       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1593     }
1594   }
1595   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 /*
1600      Makes a longer coloring[] array and calls the usual code with that
1601 */
1602 #undef __FUNCT__
1603 #define __FUNCT__ "MatColoringPatch_Inode"
1604 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1605 {
1606   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1607   PetscErrorCode  ierr;
1608   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1609   PetscInt        *colorused,i;
1610   ISColoringValue *newcolor;
1611 
1612   PetscFunctionBegin;
1613   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1614   /* loop over inodes, marking a color for each column*/
1615   row = 0;
1616   for (i=0; i<m; i++){
1617     for (j=0; j<ns[i]; j++) {
1618       newcolor[row++] = coloring[i] + j*ncolors;
1619     }
1620   }
1621 
1622   /* eliminate unneeded colors */
1623   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1624   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1625   for (i=0; i<n; i++) {
1626     colorused[newcolor[i]] = 1;
1627   }
1628 
1629   for (i=1; i<5*ncolors; i++) {
1630     colorused[i] += colorused[i-1];
1631   }
1632   ncolors = colorused[5*ncolors-1];
1633   for (i=0; i<n; i++) {
1634     newcolor[i] = colorused[newcolor[i]]-1;
1635   }
1636   ierr = PetscFree(colorused);CHKERRQ(ierr);
1637   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1638   ierr = PetscFree(coloring);CHKERRQ(ierr);
1639   PetscFunctionReturn(0);
1640 }
1641 
1642 #include "../src/mat/blockinvert.h"
1643 
1644 #undef __FUNCT__
1645 #define __FUNCT__ "MatRelax_Inode"
1646 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1647 {
1648   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1649   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1650   MatScalar          *ibdiag,*bdiag;
1651   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1652   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
1653   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
1654   PetscErrorCode     ierr;
1655   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1656   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k;
1657 
1658   PetscFunctionBegin;
1659   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1660   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1661   if (its > 1) {
1662     /* switch to non-inode version */
1663     ierr = MatRelax_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
1664     PetscFunctionReturn(0);
1665   }
1666 
1667   if (!a->inode.ibdiagvalid) {
1668     if (!a->inode.ibdiag) {
1669       /* calculate space needed for diagonal blocks */
1670       for (i=0; i<m; i++) {
1671 	cnt += sizes[i]*sizes[i];
1672       }
1673       a->inode.bdiagsize = cnt;
1674       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
1675     }
1676 
1677     /* copy over the diagonal blocks and invert them */
1678     ibdiag = a->inode.ibdiag;
1679     bdiag  = a->inode.bdiag;
1680     cnt = 0;
1681     for (i=0, row = 0; i<m; i++) {
1682       for (j=0; j<sizes[i]; j++) {
1683         for (k=0; k<sizes[i]; k++) {
1684           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1685         }
1686       }
1687       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
1688 
1689       switch(sizes[i]) {
1690         case 1:
1691           /* Create matrix data structure */
1692           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1693           ibdiag[cnt] = 1.0/ibdiag[cnt];
1694           break;
1695         case 2:
1696           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
1697           break;
1698         case 3:
1699           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
1700           break;
1701         case 4:
1702           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
1703           break;
1704         case 5:
1705           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr);
1706           break;
1707        default:
1708 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1709       }
1710       cnt += sizes[i]*sizes[i];
1711       row += sizes[i];
1712     }
1713     a->inode.ibdiagvalid = PETSC_TRUE;
1714   }
1715   ibdiag = a->inode.ibdiag;
1716   bdiag  = a->inode.bdiag;
1717 
1718 
1719   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1720   if (flag & SOR_ZERO_INITIAL_GUESS) {
1721     PetscScalar *b;
1722     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1723     if (xx != bb) {
1724       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1725     } else {
1726       b = x;
1727     }
1728     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1729 
1730       for (i=0, row=0; i<m; i++) {
1731         sz  = diag[row] - ii[row];
1732         v1  = a->a + ii[row];
1733         idx = a->j + ii[row];
1734 
1735         /* see comments for MatMult_Inode() for how this is coded */
1736         switch (sizes[i]){
1737           case 1:
1738 
1739             sum1  = b[row];
1740             for(n = 0; n<sz-1; n+=2) {
1741               i1   = idx[0];
1742               i2   = idx[1];
1743               idx += 2;
1744               tmp0 = x[i1];
1745               tmp1 = x[i2];
1746               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1747             }
1748 
1749             if (n == sz-1){
1750               tmp0  = x[*idx];
1751               sum1 -= *v1 * tmp0;
1752             }
1753             x[row++] = sum1*(*ibdiag++);
1754             break;
1755           case 2:
1756             v2    = a->a + ii[row+1];
1757             sum1  = b[row];
1758             sum2  = b[row+1];
1759             for(n = 0; n<sz-1; n+=2) {
1760               i1   = idx[0];
1761               i2   = idx[1];
1762               idx += 2;
1763               tmp0 = x[i1];
1764               tmp1 = x[i2];
1765               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1766               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1767             }
1768 
1769             if (n == sz-1){
1770               tmp0  = x[*idx];
1771               sum1 -= v1[0] * tmp0;
1772               sum2 -= v2[0] * tmp0;
1773             }
1774             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1775             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1776             ibdiag  += 4;
1777             break;
1778           case 3:
1779             v2    = a->a + ii[row+1];
1780             v3    = a->a + ii[row+2];
1781             sum1  = b[row];
1782             sum2  = b[row+1];
1783             sum3  = b[row+2];
1784             for(n = 0; n<sz-1; n+=2) {
1785               i1   = idx[0];
1786               i2   = idx[1];
1787               idx += 2;
1788               tmp0 = x[i1];
1789               tmp1 = x[i2];
1790               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1791               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1792               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1793             }
1794 
1795             if (n == sz-1){
1796               tmp0  = x[*idx];
1797               sum1 -= v1[0] * tmp0;
1798               sum2 -= v2[0] * tmp0;
1799               sum3 -= v3[0] * tmp0;
1800             }
1801             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1802             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1803             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1804             ibdiag  += 9;
1805             break;
1806           case 4:
1807             v2    = a->a + ii[row+1];
1808             v3    = a->a + ii[row+2];
1809             v4    = a->a + ii[row+3];
1810             sum1  = b[row];
1811             sum2  = b[row+1];
1812             sum3  = b[row+2];
1813             sum4  = b[row+3];
1814             for(n = 0; n<sz-1; n+=2) {
1815               i1   = idx[0];
1816               i2   = idx[1];
1817               idx += 2;
1818               tmp0 = x[i1];
1819               tmp1 = x[i2];
1820               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1821               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1822               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1823               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1824             }
1825 
1826             if (n == sz-1){
1827               tmp0  = x[*idx];
1828               sum1 -= v1[0] * tmp0;
1829               sum2 -= v2[0] * tmp0;
1830               sum3 -= v3[0] * tmp0;
1831               sum4 -= v4[0] * tmp0;
1832             }
1833             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1834             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1835             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1836             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1837             ibdiag  += 16;
1838             break;
1839           case 5:
1840             v2    = a->a + ii[row+1];
1841             v3    = a->a + ii[row+2];
1842             v4    = a->a + ii[row+3];
1843             v5    = a->a + ii[row+4];
1844             sum1  = b[row];
1845             sum2  = b[row+1];
1846             sum3  = b[row+2];
1847             sum4  = b[row+3];
1848             sum5  = b[row+4];
1849             for(n = 0; n<sz-1; n+=2) {
1850               i1   = idx[0];
1851               i2   = idx[1];
1852               idx += 2;
1853               tmp0 = x[i1];
1854               tmp1 = x[i2];
1855               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1856               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1857               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1858               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1859               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1860             }
1861 
1862             if (n == sz-1){
1863               tmp0  = x[*idx];
1864               sum1 -= v1[0] * tmp0;
1865               sum2 -= v2[0] * tmp0;
1866               sum3 -= v3[0] * tmp0;
1867               sum4 -= v4[0] * tmp0;
1868               sum5 -= v5[0] * tmp0;
1869             }
1870             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1871             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1872             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1873             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1874             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1875             ibdiag  += 25;
1876             break;
1877 	  default:
1878    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1879         }
1880       }
1881 
1882       xb = x;
1883       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1884     } else xb = b;
1885     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1886         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1887       cnt = 0;
1888       for (i=0, row=0; i<m; i++) {
1889 
1890         switch (sizes[i]){
1891           case 1:
1892             x[row++] *= bdiag[cnt++];
1893             break;
1894           case 2:
1895             x1   = x[row]; x2 = x[row+1];
1896             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1897             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1898             x[row++] = tmp1;
1899             x[row++] = tmp2;
1900             cnt += 4;
1901             break;
1902           case 3:
1903             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1904             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1905             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1906             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1907             x[row++] = tmp1;
1908             x[row++] = tmp2;
1909             x[row++] = tmp3;
1910             cnt += 9;
1911             break;
1912           case 4:
1913             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1914             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1915             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1916             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1917             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1918             x[row++] = tmp1;
1919             x[row++] = tmp2;
1920             x[row++] = tmp3;
1921             x[row++] = tmp4;
1922             cnt += 16;
1923             break;
1924           case 5:
1925             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1926             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1927             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1928             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1929             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1930             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1931             x[row++] = tmp1;
1932             x[row++] = tmp2;
1933             x[row++] = tmp3;
1934             x[row++] = tmp4;
1935             x[row++] = tmp5;
1936             cnt += 25;
1937             break;
1938 	  default:
1939    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1940         }
1941       }
1942       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1943     }
1944     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1945 
1946       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1947       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
1948         ibdiag -= sizes[i]*sizes[i];
1949         sz      = ii[row+1] - diag[row] - 1;
1950         v1      = a->a + diag[row] + 1;
1951         idx     = a->j + diag[row] + 1;
1952 
1953         /* see comments for MatMult_Inode() for how this is coded */
1954         switch (sizes[i]){
1955           case 1:
1956 
1957             sum1  = xb[row];
1958             for(n = 0; n<sz-1; n+=2) {
1959               i1   = idx[0];
1960               i2   = idx[1];
1961               idx += 2;
1962               tmp0 = x[i1];
1963               tmp1 = x[i2];
1964               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1965             }
1966 
1967             if (n == sz-1){
1968               tmp0  = x[*idx];
1969               sum1 -= *v1*tmp0;
1970             }
1971             x[row--] = sum1*(*ibdiag);
1972             break;
1973 
1974           case 2:
1975 
1976             sum1  = xb[row];
1977             sum2  = xb[row-1];
1978             /* note that sum1 is associated with the second of the two rows */
1979             v2    = a->a + diag[row-1] + 2;
1980             for(n = 0; n<sz-1; n+=2) {
1981               i1   = idx[0];
1982               i2   = idx[1];
1983               idx += 2;
1984               tmp0 = x[i1];
1985               tmp1 = x[i2];
1986               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1987               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1988             }
1989 
1990             if (n == sz-1){
1991               tmp0  = x[*idx];
1992               sum1 -= *v1*tmp0;
1993               sum2 -= *v2*tmp0;
1994             }
1995             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1996             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1997             break;
1998           case 3:
1999 
2000             sum1  = xb[row];
2001             sum2  = xb[row-1];
2002             sum3  = xb[row-2];
2003             v2    = a->a + diag[row-1] + 2;
2004             v3    = a->a + diag[row-2] + 3;
2005             for(n = 0; n<sz-1; n+=2) {
2006               i1   = idx[0];
2007               i2   = idx[1];
2008               idx += 2;
2009               tmp0 = x[i1];
2010               tmp1 = x[i2];
2011               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2012               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2013               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2014             }
2015 
2016             if (n == sz-1){
2017               tmp0  = x[*idx];
2018               sum1 -= *v1*tmp0;
2019               sum2 -= *v2*tmp0;
2020               sum3 -= *v3*tmp0;
2021             }
2022             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2023             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2024             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2025             break;
2026           case 4:
2027 
2028             sum1  = xb[row];
2029             sum2  = xb[row-1];
2030             sum3  = xb[row-2];
2031             sum4  = xb[row-3];
2032             v2    = a->a + diag[row-1] + 2;
2033             v3    = a->a + diag[row-2] + 3;
2034             v4    = a->a + diag[row-3] + 4;
2035             for(n = 0; n<sz-1; n+=2) {
2036               i1   = idx[0];
2037               i2   = idx[1];
2038               idx += 2;
2039               tmp0 = x[i1];
2040               tmp1 = x[i2];
2041               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2042               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2043               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2044               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2045             }
2046 
2047             if (n == sz-1){
2048               tmp0  = x[*idx];
2049               sum1 -= *v1*tmp0;
2050               sum2 -= *v2*tmp0;
2051               sum3 -= *v3*tmp0;
2052               sum4 -= *v4*tmp0;
2053             }
2054             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2055             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2056             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2057             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2058             break;
2059           case 5:
2060 
2061             sum1  = xb[row];
2062             sum2  = xb[row-1];
2063             sum3  = xb[row-2];
2064             sum4  = xb[row-3];
2065             sum5  = xb[row-4];
2066             v2    = a->a + diag[row-1] + 2;
2067             v3    = a->a + diag[row-2] + 3;
2068             v4    = a->a + diag[row-3] + 4;
2069             v5    = a->a + diag[row-4] + 5;
2070             for(n = 0; n<sz-1; n+=2) {
2071               i1   = idx[0];
2072               i2   = idx[1];
2073               idx += 2;
2074               tmp0 = x[i1];
2075               tmp1 = x[i2];
2076               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2077               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2078               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2079               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2080               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2081             }
2082 
2083             if (n == sz-1){
2084               tmp0  = x[*idx];
2085               sum1 -= *v1*tmp0;
2086               sum2 -= *v2*tmp0;
2087               sum3 -= *v3*tmp0;
2088               sum4 -= *v4*tmp0;
2089               sum5 -= *v5*tmp0;
2090             }
2091             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2092             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2093             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2094             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2095             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2096             break;
2097 	  default:
2098    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2099         }
2100       }
2101 
2102       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2103     }
2104     its--;
2105     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2106     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2107   }
2108   if (flag & SOR_EISENSTAT) {
2109     const PetscScalar *b;
2110     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2111     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2112     MatScalar *t = a->inode.ssor_work;
2113     /*
2114           Apply  (U + D)^-1  where D is now the block diagonal
2115     */
2116     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2117     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2118       ibdiag -= sizes[i]*sizes[i];
2119       sz      = ii[row+1] - diag[row] - 1;
2120       v1      = a->a + diag[row] + 1;
2121       idx     = a->j + diag[row] + 1;
2122       CHKMEMQ;
2123       /* see comments for MatMult_Inode() for how this is coded */
2124       switch (sizes[i]){
2125         case 1:
2126 
2127 	  sum1  = b[row];
2128 	  for(n = 0; n<sz-1; n+=2) {
2129 	    i1   = idx[0];
2130 	    i2   = idx[1];
2131 	    idx += 2;
2132 	    tmp0 = x[i1];
2133 	    tmp1 = x[i2];
2134 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2135 	  }
2136 
2137 	  if (n == sz-1){
2138 	    tmp0  = x[*idx];
2139 	    sum1 -= *v1*tmp0;
2140 	  }
2141 	  x[row] = sum1*(*ibdiag);row--;
2142 	  break;
2143 
2144         case 2:
2145 
2146 	  sum1  = b[row];
2147 	  sum2  = b[row-1];
2148 	  /* note that sum1 is associated with the second of the two rows */
2149 	  v2    = a->a + diag[row-1] + 2;
2150 	  for(n = 0; n<sz-1; n+=2) {
2151 	    i1   = idx[0];
2152 	    i2   = idx[1];
2153 	    idx += 2;
2154 	    tmp0 = x[i1];
2155 	    tmp1 = x[i2];
2156 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2157 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2158 	  }
2159 
2160 	  if (n == sz-1){
2161 	    tmp0  = x[*idx];
2162 	    sum1 -= *v1*tmp0;
2163 	    sum2 -= *v2*tmp0;
2164 	  }
2165 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
2166 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
2167           row -= 2;
2168 	  break;
2169         case 3:
2170 
2171 	  sum1  = b[row];
2172 	  sum2  = b[row-1];
2173 	  sum3  = b[row-2];
2174 	  v2    = a->a + diag[row-1] + 2;
2175 	  v3    = a->a + diag[row-2] + 3;
2176 	  for(n = 0; n<sz-1; n+=2) {
2177 	    i1   = idx[0];
2178 	    i2   = idx[1];
2179 	    idx += 2;
2180 	    tmp0 = x[i1];
2181 	    tmp1 = x[i2];
2182 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2183 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2184 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2185 	  }
2186 
2187 	  if (n == sz-1){
2188 	    tmp0  = x[*idx];
2189 	    sum1 -= *v1*tmp0;
2190 	    sum2 -= *v2*tmp0;
2191 	    sum3 -= *v3*tmp0;
2192 	  }
2193 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2194 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2195 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2196           row -= 3;
2197 	  break;
2198         case 4:
2199 
2200 	  sum1  = b[row];
2201 	  sum2  = b[row-1];
2202 	  sum3  = b[row-2];
2203 	  sum4  = b[row-3];
2204 	  v2    = a->a + diag[row-1] + 2;
2205 	  v3    = a->a + diag[row-2] + 3;
2206 	  v4    = a->a + diag[row-3] + 4;
2207 	  for(n = 0; n<sz-1; n+=2) {
2208 	    i1   = idx[0];
2209 	    i2   = idx[1];
2210 	    idx += 2;
2211 	    tmp0 = x[i1];
2212 	    tmp1 = x[i2];
2213 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2214 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2215 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2216 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2217 	  }
2218 
2219 	  if (n == sz-1){
2220 	    tmp0  = x[*idx];
2221 	    sum1 -= *v1*tmp0;
2222 	    sum2 -= *v2*tmp0;
2223 	    sum3 -= *v3*tmp0;
2224 	    sum4 -= *v4*tmp0;
2225 	  }
2226 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2227 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2228 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2229 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2230           row -= 4;
2231 	  break;
2232         case 5:
2233 
2234 	  sum1  = b[row];
2235 	  sum2  = b[row-1];
2236 	  sum3  = b[row-2];
2237 	  sum4  = b[row-3];
2238 	  sum5  = b[row-4];
2239 	  v2    = a->a + diag[row-1] + 2;
2240 	  v3    = a->a + diag[row-2] + 3;
2241 	  v4    = a->a + diag[row-3] + 4;
2242 	  v5    = a->a + diag[row-4] + 5;
2243 	  for(n = 0; n<sz-1; n+=2) {
2244 	    i1   = idx[0];
2245 	    i2   = idx[1];
2246 	    idx += 2;
2247 	    tmp0 = x[i1];
2248 	    tmp1 = x[i2];
2249 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2250 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2251 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2252 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2253 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2254 	  }
2255 
2256 	  if (n == sz-1){
2257 	    tmp0  = x[*idx];
2258 	    sum1 -= *v1*tmp0;
2259 	    sum2 -= *v2*tmp0;
2260 	    sum3 -= *v3*tmp0;
2261 	    sum4 -= *v4*tmp0;
2262 	    sum5 -= *v5*tmp0;
2263 	  }
2264 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2265 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2266 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2267 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2268 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2269           row -= 5;
2270 	  break;
2271         default:
2272 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2273       }
2274       CHKMEMQ;
2275     }
2276     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2277 
2278     /*
2279            t = b - D x    where D is the block diagonal
2280     */
2281     cnt = 0;
2282     for (i=0, row=0; i<m; i++) {
2283       CHKMEMQ;
2284       switch (sizes[i]){
2285         case 1:
2286 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
2287 	  break;
2288         case 2:
2289 	  x1   = x[row]; x2 = x[row+1];
2290 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2291 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2292 	  t[row]   = b[row] - tmp1;
2293 	  t[row+1] = b[row+1] - tmp2; row += 2;
2294 	  cnt += 4;
2295 	  break;
2296         case 3:
2297 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2298 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2299 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2300 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2301 	  t[row] = b[row] - tmp1;
2302 	  t[row+1] = b[row+1] - tmp2;
2303 	  t[row+2] = b[row+2] - tmp3; row += 3;
2304 	  cnt += 9;
2305 	  break;
2306         case 4:
2307 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2308 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2309 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2310 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2311 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2312 	  t[row] = b[row] - tmp1;
2313 	  t[row+1] = b[row+1] - tmp2;
2314 	  t[row+2] = b[row+2] - tmp3;
2315 	  t[row+3] = b[row+3] - tmp4; row += 4;
2316 	  cnt += 16;
2317 	  break;
2318         case 5:
2319 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2320 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2321 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2322 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2323 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2324 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2325 	  t[row] = b[row] - tmp1;
2326 	  t[row+1] = b[row+1] - tmp2;
2327 	  t[row+2] = b[row+2] - tmp3;
2328 	  t[row+3] = b[row+3] - tmp4;
2329 	  t[row+4] = b[row+4] - tmp5;row += 5;
2330 	  cnt += 25;
2331 	  break;
2332         default:
2333 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2334       }
2335       CHKMEMQ;
2336     }
2337     ierr = PetscLogFlops(m);CHKERRQ(ierr);
2338 
2339 
2340 
2341     /*
2342           Apply (L + D)^-1 where D is the block diagonal
2343     */
2344     for (i=0, row=0; i<m; i++) {
2345       sz  = diag[row] - ii[row];
2346       v1  = a->a + ii[row];
2347       idx = a->j + ii[row];
2348       CHKMEMQ;
2349       /* see comments for MatMult_Inode() for how this is coded */
2350       switch (sizes[i]){
2351         case 1:
2352 
2353 	  sum1  = t[row];
2354 	  for(n = 0; n<sz-1; n+=2) {
2355 	    i1   = idx[0];
2356 	    i2   = idx[1];
2357 	    idx += 2;
2358 	    tmp0 = t[i1];
2359 	    tmp1 = t[i2];
2360 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2361 	  }
2362 
2363 	  if (n == sz-1){
2364 	    tmp0  = t[*idx];
2365 	    sum1 -= *v1 * tmp0;
2366 	  }
2367 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
2368 	  break;
2369         case 2:
2370 	  v2    = a->a + ii[row+1];
2371 	  sum1  = t[row];
2372 	  sum2  = t[row+1];
2373 	  for(n = 0; n<sz-1; n+=2) {
2374 	    i1   = idx[0];
2375 	    i2   = idx[1];
2376 	    idx += 2;
2377 	    tmp0 = t[i1];
2378 	    tmp1 = t[i2];
2379 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2380 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2381 	  }
2382 
2383 	  if (n == sz-1){
2384 	    tmp0  = t[*idx];
2385 	    sum1 -= v1[0] * tmp0;
2386 	    sum2 -= v2[0] * tmp0;
2387 	  }
2388 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
2389 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
2390 	  ibdiag  += 4; row += 2;
2391 	  break;
2392         case 3:
2393 	  v2    = a->a + ii[row+1];
2394 	  v3    = a->a + ii[row+2];
2395 	  sum1  = t[row];
2396 	  sum2  = t[row+1];
2397 	  sum3  = t[row+2];
2398 	  for(n = 0; n<sz-1; n+=2) {
2399 	    i1   = idx[0];
2400 	    i2   = idx[1];
2401 	    idx += 2;
2402 	    tmp0 = t[i1];
2403 	    tmp1 = t[i2];
2404 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2405 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2406 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2407 	  }
2408 
2409 	  if (n == sz-1){
2410 	    tmp0  = t[*idx];
2411 	    sum1 -= v1[0] * tmp0;
2412 	    sum2 -= v2[0] * tmp0;
2413 	    sum3 -= v3[0] * tmp0;
2414 	  }
2415 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2416 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2417 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2418 	  ibdiag  += 9; row += 3;
2419 	  break;
2420         case 4:
2421 	  v2    = a->a + ii[row+1];
2422 	  v3    = a->a + ii[row+2];
2423 	  v4    = a->a + ii[row+3];
2424 	  sum1  = t[row];
2425 	  sum2  = t[row+1];
2426 	  sum3  = t[row+2];
2427 	  sum4  = t[row+3];
2428 	  for(n = 0; n<sz-1; n+=2) {
2429 	    i1   = idx[0];
2430 	    i2   = idx[1];
2431 	    idx += 2;
2432 	    tmp0 = t[i1];
2433 	    tmp1 = t[i2];
2434 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2435 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2436 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2437 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2438 	  }
2439 
2440 	  if (n == sz-1){
2441 	    tmp0  = t[*idx];
2442 	    sum1 -= v1[0] * tmp0;
2443 	    sum2 -= v2[0] * tmp0;
2444 	    sum3 -= v3[0] * tmp0;
2445 	    sum4 -= v4[0] * tmp0;
2446 	  }
2447 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2448 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2449 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2450 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2451 	  ibdiag  += 16; row += 4;
2452 	  break;
2453         case 5:
2454 	  v2    = a->a + ii[row+1];
2455 	  v3    = a->a + ii[row+2];
2456 	  v4    = a->a + ii[row+3];
2457 	  v5    = a->a + ii[row+4];
2458 	  sum1  = t[row];
2459 	  sum2  = t[row+1];
2460 	  sum3  = t[row+2];
2461 	  sum4  = t[row+3];
2462 	  sum5  = t[row+4];
2463 	  for(n = 0; n<sz-1; n+=2) {
2464 	    i1   = idx[0];
2465 	    i2   = idx[1];
2466 	    idx += 2;
2467 	    tmp0 = t[i1];
2468 	    tmp1 = t[i2];
2469 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2470 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2471 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2472 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2473 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2474 	  }
2475 
2476 	  if (n == sz-1){
2477 	    tmp0  = t[*idx];
2478 	    sum1 -= v1[0] * tmp0;
2479 	    sum2 -= v2[0] * tmp0;
2480 	    sum3 -= v3[0] * tmp0;
2481 	    sum4 -= v4[0] * tmp0;
2482 	    sum5 -= v5[0] * tmp0;
2483 	  }
2484 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2485 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2486 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2487 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2488 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2489 	  ibdiag  += 25; row += 5;
2490 	  break;
2491         default:
2492 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2493       }
2494       CHKMEMQ;
2495     }
2496     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2497     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2498     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2499   }
2500   PetscFunctionReturn(0);
2501 }
2502 
2503 #undef __FUNCT__
2504 #define __FUNCT__ "MatMultDiagonalBlock_Inode"
2505 PetscErrorCode MatMultDiagonalBlock_Inode(Mat A,Vec bb,Vec xx)
2506 {
2507   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2508   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
2509   const MatScalar    *bdiag = a->inode.bdiag;
2510   const PetscScalar  *b;
2511   PetscErrorCode      ierr;
2512   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
2513   const PetscInt      *sizes = a->inode.size;
2514 
2515   PetscFunctionBegin;
2516   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2517   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2518   cnt = 0;
2519   for (i=0, row=0; i<m; i++) {
2520     switch (sizes[i]){
2521       case 1:
2522 	x[row] = b[row]*bdiag[cnt++];row++;
2523 	break;
2524       case 2:
2525 	x1   = b[row]; x2 = b[row+1];
2526 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2527 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2528 	x[row++] = tmp1;
2529 	x[row++] = tmp2;
2530 	cnt += 4;
2531 	break;
2532       case 3:
2533 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
2534 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2535 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2536 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2537 	x[row++] = tmp1;
2538 	x[row++] = tmp2;
2539 	x[row++] = tmp3;
2540 	cnt += 9;
2541 	break;
2542       case 4:
2543 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
2544 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2545 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2546 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2547 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2548 	x[row++] = tmp1;
2549 	x[row++] = tmp2;
2550 	x[row++] = tmp3;
2551 	x[row++] = tmp4;
2552 	cnt += 16;
2553 	break;
2554       case 5:
2555 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
2556 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2557 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2558 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2559 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2560 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2561 	x[row++] = tmp1;
2562 	x[row++] = tmp2;
2563 	x[row++] = tmp3;
2564 	x[row++] = tmp4;
2565 	x[row++] = tmp5;
2566 	cnt += 25;
2567 	break;
2568       default:
2569 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2570     }
2571   }
2572   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
2573   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2574   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2575   PetscFunctionReturn(0);
2576 }
2577 
2578 extern PetscErrorCode MatMultDiagonalBlock_Inode(Mat,Vec,Vec);
2579 /*
2580     samestructure indicates that the matrix has not changed its nonzero structure so we
2581     do not need to recompute the inodes
2582 */
2583 #undef __FUNCT__
2584 #define __FUNCT__ "Mat_CheckInode"
2585 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2586 {
2587   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2588   PetscErrorCode ierr;
2589   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2590   PetscTruth     flag;
2591 
2592   PetscFunctionBegin;
2593   if (!a->inode.use)                     PetscFunctionReturn(0);
2594   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2595 
2596 
2597   m = A->rmap->n;
2598   if (a->inode.size) {ns = a->inode.size;}
2599   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2600 
2601   i          = 0;
2602   node_count = 0;
2603   idx        = a->j;
2604   ii         = a->i;
2605   while (i < m){                /* For each row */
2606     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2607     /* Limits the number of elements in a node to 'a->inode.limit' */
2608     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2609       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2610       if (nzy != nzx) break;
2611       idy  += nzx;             /* Same nonzero pattern */
2612       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2613       if (!flag) break;
2614     }
2615     ns[node_count++] = blk_size;
2616     idx += blk_size*nzx;
2617     i    = j;
2618   }
2619   /* If not enough inodes found,, do not use inode version of the routines */
2620   if (!a->inode.size && m && node_count > .9*m) {
2621     ierr = PetscFree(ns);CHKERRQ(ierr);
2622     a->inode.node_count     = 0;
2623     a->inode.size           = PETSC_NULL;
2624     a->inode.use            = PETSC_FALSE;
2625     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2626   } else {
2627     A->ops->mult              = MatMult_Inode;
2628     A->ops->relax             = MatRelax_Inode;
2629     A->ops->multadd           = MatMultAdd_Inode;
2630     A->ops->getrowij          = MatGetRowIJ_Inode;
2631     A->ops->restorerowij      = MatRestoreRowIJ_Inode;
2632     A->ops->getcolumnij       = MatGetColumnIJ_Inode;
2633     A->ops->restorecolumnij   = MatRestoreColumnIJ_Inode;
2634     A->ops->coloringpatch     = MatColoringPatch_Inode;
2635     A->ops->multdiagonalblock = MatMultDiagonalBlock_Inode;
2636     a->inode.node_count       = node_count;
2637     a->inode.size             = ns;
2638     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2639   }
2640   PetscFunctionReturn(0);
2641 }
2642 
2643 /*
2644      This is really ugly. if inodes are used this replaces the
2645   permutations with ones that correspond to rows/cols of the matrix
2646   rather then inode blocks
2647 */
2648 #undef __FUNCT__
2649 #define __FUNCT__ "MatInodeAdjustForInodes"
2650 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2651 {
2652   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2653 
2654   PetscFunctionBegin;
2655   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2656   if (f) {
2657     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2658   }
2659   PetscFunctionReturn(0);
2660 }
2661 
2662 EXTERN_C_BEGIN
2663 #undef __FUNCT__
2664 #define __FUNCT__ "MatAdjustForInodes_Inode"
2665 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2666 {
2667   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2668   PetscErrorCode ierr;
2669   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
2670   const PetscInt *ridx,*cidx;
2671   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2672   PetscInt       nslim_col,*ns_col;
2673   IS             ris = *rperm,cis = *cperm;
2674 
2675   PetscFunctionBegin;
2676   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2677   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2678 
2679   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2680   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2681   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
2682   permc = permr + m;
2683 
2684   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2685   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2686 
2687   /* Form the inode structure for the rows of permuted matric using inv perm*/
2688   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2689 
2690   /* Construct the permutations for rows*/
2691   for (i=0,row = 0; i<nslim_row; ++i){
2692     indx      = ridx[i];
2693     start_val = tns[indx];
2694     end_val   = tns[indx + 1];
2695     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2696   }
2697 
2698   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2699   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2700 
2701  /* Construct permutations for columns */
2702   for (i=0,col=0; i<nslim_col; ++i){
2703     indx      = cidx[i];
2704     start_val = tns[indx];
2705     end_val   = tns[indx + 1];
2706     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2707   }
2708 
2709   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2710   ISSetPermutation(*rperm);
2711   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2712   ISSetPermutation(*cperm);
2713 
2714   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2715   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2716 
2717   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2718   ierr = PetscFree(permr);CHKERRQ(ierr);
2719   ierr = ISDestroy(cis);CHKERRQ(ierr);
2720   ierr = ISDestroy(ris);CHKERRQ(ierr);
2721   ierr = PetscFree(tns);CHKERRQ(ierr);
2722   PetscFunctionReturn(0);
2723 }
2724 EXTERN_C_END
2725 
2726 #undef __FUNCT__
2727 #define __FUNCT__ "MatInodeGetInodeSizes"
2728 /*@C
2729    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2730 
2731    Collective on Mat
2732 
2733    Input Parameter:
2734 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2735 
2736    Output Parameter:
2737 +  node_count - no of inodes present in the matrix.
2738 .  sizes      - an array of size node_count,with sizes of each inode.
2739 -  limit      - the max size used to generate the inodes.
2740 
2741    Level: advanced
2742 
2743    Notes: This routine returns some internal storage information
2744    of the matrix, it is intended to be used by advanced users.
2745    It should be called after the matrix is assembled.
2746    The contents of the sizes[] array should not be changed.
2747    PETSC_NULL may be passed for information not requested.
2748 
2749 .keywords: matrix, seqaij, get, inode
2750 
2751 .seealso: MatGetInfo()
2752 @*/
2753 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2754 {
2755   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2756 
2757   PetscFunctionBegin;
2758   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2759   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2760   if (f) {
2761     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2762   }
2763   PetscFunctionReturn(0);
2764 }
2765 
2766 EXTERN_C_BEGIN
2767 #undef __FUNCT__
2768 #define __FUNCT__ "MatInodeGetInodeSizes_Inode"
2769 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2770 {
2771   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2772 
2773   PetscFunctionBegin;
2774   if (node_count) *node_count = a->inode.node_count;
2775   if (sizes)      *sizes      = a->inode.size;
2776   if (limit)      *limit      = a->inode.limit;
2777   PetscFunctionReturn(0);
2778 }
2779 EXTERN_C_END
2780