xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 3d33046db24e4f573678e739063cfd2dc46a9b5e)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat              C=B;
1186   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187   IS               isrow = b->row,isicol = b->icol;
1188   PetscErrorCode   ierr;
1189   const PetscInt   *r,*ic,*ics;
1190   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191   PetscInt         i,j,k,nz,nzL,row,*pj;
1192   const PetscInt   *ajtmp,*bjtmp;
1193   MatScalar        *rtmp,*pc,*pc1,*pc2,multiplier,multiplier1,multiplier2,*pv,*rtmp11,*rtmp22,*rtmp33;
1194   const  MatScalar *aa=a->a,*v,*v1,*v2;
1195   FactorShiftCtx   sctx;
1196   PetscInt         *ddiag;
1197   PetscReal        rs;
1198   MatScalar        d;
1199   PetscInt         inod,nodesz,kk1,kk2;
1200 
1201 
1202   PetscFunctionBegin;
1203   printf("MatLUFactorNumeric_SeqAIJ_Inode...\n");
1204   /* MatPivotSetUp(): initialize shift context sctx */
1205   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1206 
1207   if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1208     ddiag          = a->diag;
1209     sctx.shift_top = info->zeropivot;
1210     for (i=0; i<n; i++) {
1211       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1212       d  = (aa)[ddiag[i]];
1213       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1214       v  = aa+ai[i];
1215       nz = ai[i+1] - ai[i];
1216       for (j=0; j<nz; j++)
1217 	rs += PetscAbsScalar(v[j]);
1218       if (rs>sctx.shift_top) sctx.shift_top = rs;
1219     }
1220     sctx.shift_top   *= 1.1;
1221     sctx.nshift_max   = 5;
1222     sctx.shift_lo     = 0.;
1223     sctx.shift_hi     = 1.;
1224   }
1225 
1226   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1227   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1228   //ierr = PetscMalloc((n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1229   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1230   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1231   rtmp   = rtmp11;
1232   rtmp22 = rtmp11 + n;
1233   rtmp33 = rtmp22 + n;
1234   ics  = ic;
1235 
1236   PetscInt node_max,*ns;
1237   node_max = a->inode.node_count;
1238   ns       = a->inode.size;
1239   if (!ns){
1240     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1241   }
1242 
1243   do {
1244     sctx.useshift = PETSC_FALSE;
1245     /* Now loop over each block-row, and do the factorization */
1246     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1247       //for (i=0; i<n; i++){
1248       nodesz = ns[inod];
1249       //printf("row %d, inod %d, nodesz %d\n",i,inod,nodesz);
1250 
1251       switch (nodesz){
1252       case 1:
1253       /* zero rtmp */
1254       /* L part */
1255       nz    = bi[i+1] - bi[i];
1256       bjtmp = bj + bi[i];
1257       for  (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0;
1258 
1259       /* U part */
1260       nz = bdiag[i]-bdiag[i+1];
1261       bjtmp = bj + bdiag[i+1]+1;
1262       for  (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0;
1263 
1264       /* load in initial (unfactored row) */
1265       nz    = ai[r[i]+1] - ai[r[i]];
1266       ajtmp = aj + ai[r[i]];
1267       v     = aa + ai[r[i]];
1268       for (j=0; j<nz; j++) {
1269         rtmp[ics[ajtmp[j]]] = v[j];
1270       }
1271       /* ZeropivotApply() */
1272       rtmp[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1273 
1274       /* elimination */
1275       bjtmp = bj + bi[i];
1276       row   = *bjtmp++;
1277       nzL   = bi[i+1] - bi[i];
1278       for(k=0; k < nzL;k++) {
1279         pc = rtmp + row;
1280         if (*pc != 0.0) {
1281           pv         = b->a + bdiag[row];
1282           multiplier = *pc * (*pv);
1283           *pc        = multiplier;
1284           pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1285 	  pv = b->a + bdiag[row+1]+1;
1286 	  nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1287           for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
1288           ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1289         }
1290         row = *bjtmp++;
1291       }
1292 
1293       /* finished row so stick it into b->a */
1294       rs = 0.0;
1295       /* L part */
1296       pv   = b->a + bi[i] ;
1297       pj   = b->j + bi[i] ;
1298       nz   = bi[i+1] - bi[i];
1299       for (j=0; j<nz; j++) {
1300         pv[j] = rtmp[pj[j]]; rs += PetscAbsScalar(pv[j]);
1301       }
1302 
1303       /* U part */
1304       pv = b->a + bdiag[i+1]+1;
1305       pj = b->j + bdiag[i+1]+1;
1306       nz = bdiag[i] - bdiag[i+1]-1;
1307       for (j=0; j<nz; j++) {
1308         pv[j] = rtmp[pj[j]]; rs += PetscAbsScalar(pv[j]);
1309       }
1310 
1311       /* MatPivotCheck() */
1312       sctx.rs  = rs;
1313       sctx.pv  = rtmp[i];
1314       if (info->shifttype == MAT_SHIFT_NONZERO){
1315         ierr = MatPivotCheck_nz(info,sctx,i);CHKERRQ(ierr);
1316       } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){
1317         ierr = MatPivotCheck_pd(info,sctx,i);CHKERRQ(ierr);
1318       } else if (info->shifttype == MAT_SHIFT_INBLOCKS){
1319         ierr = MatPivotCheck_inblocks(info,sctx,i);CHKERRQ(ierr);
1320       } else {
1321         ierr = MatPivotCheck_none(info,sctx,i);CHKERRQ(ierr);
1322       }
1323       rtmp[i] = sctx.pv;
1324 
1325       /* Mark diagonal and invert diagonal for simplier triangular solves */
1326       pv  = b->a + bdiag[i];
1327       *pv = 1.0/rtmp[i];
1328 
1329       break;
1330 
1331       case 2:
1332       /*----------*/
1333         /* zero rtmp */
1334         /* L part */
1335         nz    = bi[i+1] - bi[i];
1336         bjtmp = bj + bi[i];
1337         for  (j=0; j<nz; j++) {
1338           rtmp11[bjtmp[j]] = 0.0; rtmp22[bjtmp[j]] = 0.0;
1339         }
1340 
1341         /* U part */
1342         nz = bdiag[i]-bdiag[i+1];
1343         bjtmp = bj + bdiag[i+1]+1;
1344         for  (j=0; j<nz; j++) {
1345           rtmp11[bjtmp[j]] = 0.0; rtmp22[bjtmp[j]] = 0.0;
1346         }
1347 
1348         /* load in initial (unfactored row) */
1349         nz    = ai[r[i]+1] - ai[r[i]];
1350         ajtmp = aj + ai[r[i]];
1351         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1352         for (j=0; j<nz; j++) {
1353           rtmp11[ics[ajtmp[j]]] = v1[j]; rtmp22[ics[ajtmp[j]]] = v2[j];
1354         }
1355         /* ZeropivotApply(): shift the diagonal of the matrix  */
1356         rtmp11[i] += sctx.shift_amount; rtmp22[i+1] += sctx.shift_amount;
1357 
1358         /* elimination */
1359         bjtmp = bj + bi[i];
1360         row   = *bjtmp++; /* pivot row */
1361         nzL   = bi[i+1] - bi[i];
1362         for(k=0; k < nzL;k++) {
1363 
1364           pc1 = rtmp11 + row;
1365           pc2 = rtmp22 + row;
1366           if (*pc1 != 0.0 || *pc2 != 0.0) {
1367             pv         = b->a + bdiag[row];
1368             multiplier1 = *pc1*(*pv);    multiplier2 = *pc2*(*pv);
1369             *pc1        = multiplier1;   *pc2 = multiplier2;
1370 
1371             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1372             pv = b->a + bdiag[row+1]+1;
1373             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1374             for (j=0; j<nz; j++){
1375               rtmp11[pj[j]] -= multiplier1 * pv[j];
1376               rtmp22[pj[j]] -= multiplier2 * pv[j];
1377             }
1378             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1379           }
1380           row = *bjtmp++;
1381         }
1382 
1383         /* Now take care of diagonal 2x2 block. */
1384         pc1 = rtmp11 + i;
1385         pc2 = rtmp22 + i;
1386 
1387         if (*pc2 != 0.0){
1388           multiplier = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1389           *pc2       = multiplier;    /* insert L entry */
1390           pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1391           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1392           for (j=0; j<nz; j++) {
1393             rtmp22[pj[j]] -= multiplier * rtmp11[pj[j]];
1394           }
1395           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1396         }
1397 
1398 
1399         /* finished rows so stick it into b->a */
1400         rs = 0.0;
1401         /* L part */
1402         kk1 = kk2 =0;
1403         pc1 = b->a + bi[i]; pc2 = b->a + bi[i+1];
1404         pj  = b->j + bi[i] ;
1405         nz  = bi[i+1] - bi[i];
1406         for (j=0; j<nz; j++) {
1407           if (pj[j] < i) {pc1[kk1] = rtmp11[pj[j]]; kk1++; rs += PetscAbsScalar(pc1[j]);}
1408           if (pj[j] < i+1) pc2[kk2] = rtmp22[pj[j]]; kk2++;
1409           //printf("%d, rtmp11 %g, rtmp22 = %g\n",pj[j],rtmp11[pj[j]],rtmp22[pj[j]]);
1410         }
1411 
1412         /* U part */
1413         pc1 = b->a + bdiag[i+1]+1; pc2 = b->a + bdiag[i+2]+1;
1414         pj = b->j + bdiag[i+1]+1;
1415         nz = bdiag[i] - bdiag[i+1]; // inlcude diagonal
1416 
1417         kk1 = kk2 =0;
1418         for (j=0; j<nz; j++) {
1419           if (pj[j]>i) {
1420             pc1[kk1] = rtmp11[pj[j]]; rs += PetscAbsScalar(pc1[j]);
1421             kk1++;
1422           }
1423           if (pj[j] > i+1){
1424             pc2[kk2] = rtmp22[pj[j]]; kk2++;
1425             //printf("%d, rtmp11 %g, rtmp22 = %g\n",pj[j],rtmp11[pj[j]],rtmp22[pj[j]]);
1426           }
1427         }
1428 
1429 #if defined(TMP)
1430       /* MatPivotCheck() */
1431       sctx.rs  = rs;
1432       sctx.pv  = rtmp11[i];
1433       if (info->shifttype == MAT_SHIFT_NONZERO){
1434         ierr = MatPivotCheck_nz(info,sctx,i);CHKERRQ(ierr);
1435       } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){
1436         ierr = MatPivotCheck_pd(info,sctx,i);CHKERRQ(ierr);
1437       } else if (info->shifttype == MAT_SHIFT_INBLOCKS){
1438         ierr = MatPivotCheck_inblocks(info,sctx,i);CHKERRQ(ierr);
1439       } else {
1440         ierr = MatPivotCheck_none(info,sctx,i);CHKERRQ(ierr);
1441       }
1442       rtmp11[i] = sctx.pv;
1443 #endif
1444       /* Mark diagonal and invert diagonal for simplier triangular solves */
1445       pc1  = b->a + bdiag[i];
1446       *pc1 = 1.0/rtmp11[i];
1447 #if defined(TMP)
1448       sctx.rs  = rs;
1449       sctx.pv  = rtmp22[i+1];
1450       if (info->shifttype == MAT_SHIFT_NONZERO){
1451         ierr = MatPivotCheck_nz(info,sctx,i+1);CHKERRQ(ierr);
1452       } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){
1453         ierr = MatPivotCheck_pd(info,sctx,i+1);CHKERRQ(ierr);
1454       } else if (info->shifttype == MAT_SHIFT_INBLOCKS){
1455         ierr = MatPivotCheck_inblocks(info,sctx,i+1);CHKERRQ(ierr);
1456       } else {
1457         ierr = MatPivotCheck_none(info,sctx,i+1);CHKERRQ(ierr);
1458       }
1459       rtmp22[i+1] = sctx.pv;
1460 #endif
1461       /* Mark diagonal and invert diagonal for simplier triangular solves */
1462       pc2  = b->a + bdiag[i+1];
1463       *pc2 = 1.0/rtmp22[i+1];
1464 #if defined(MV)
1465       // L part
1466       printf("row %d: ",i+1);
1467       for (j=b->i[i+1]; j<b->i[i+1+1]; j++) {
1468         printf( "(%d, %g), ",b->j[j],b->a[j]);
1469       }
1470       /* diagonal */
1471       j = b->diag[i+1];
1472       printf( "(%d, %g), ",b->j[j],b->a[j]);
1473       // U
1474       for (j=b->diag[i+1+1]+1; j<b->diag[i+1]; j++) {
1475         printf( "(%d, %g), ",b->j[j],b->a[j]);
1476       }
1477       printf("\n");
1478 #endif
1479         break;
1480       default:
1481         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
1482       }
1483       i += nodesz;                 /* Update the row */
1484 
1485     } /* endof for (i=0; i<n; i++){ */
1486 
1487     /* MatPivotRefine() */
1488     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1489       /*
1490        * if no shift in this attempt & shifting & started shifting & can refine,
1491        * then try lower shift
1492        */
1493       sctx.shift_hi       = sctx.shift_fraction;
1494       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1495       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1496       sctx.useshift        = PETSC_TRUE;
1497       sctx.nshift++;
1498     }
1499   } while (sctx.useshift);
1500 
1501   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1502   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1503   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1504 
1505   C->ops->solve              = MatSolve_SeqAIJ_Inode;
1506   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1507   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1508   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1509   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1510   C->assembled    = PETSC_TRUE;
1511   C->preallocated = PETSC_TRUE;
1512   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1513 
1514   /* MatShiftView(A,info,&sctx) */
1515   if (sctx.nshift){
1516     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) {
1517       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1518     } else if (info->shifttype == MAT_SHIFT_NONZERO) {
1519       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1520     } else if (info->shifttype == MAT_SHIFT_INBLOCKS){
1521       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1522     }
1523   }
1524   PetscFunctionReturn(0);
1525 }
1526 
1527 #undef __FUNCT__
1528 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1529 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1530 {
1531   Mat               C = B;
1532   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1533   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1534   PetscErrorCode    ierr;
1535   const PetscInt    *r,*ic,*c,*ics;
1536   PetscInt          n = A->rmap->n,*bi = b->i;
1537   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1538   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1539   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1540   PetscScalar       mul1,mul2,mul3,tmp;
1541   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1542   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1543   PetscReal         rs=0.0;
1544   FactorShiftCtx    sctx;
1545   PetscInt          newshift;
1546 
1547   PetscFunctionBegin;
1548   sctx.shift_top      = 0;
1549   sctx.nshift_max     = 0;
1550   sctx.shift_lo       = 0;
1551   sctx.shift_hi       = 0;
1552   sctx.shift_fraction = 0;
1553 
1554   /* if both shift schemes are chosen by user, only use info->shiftpd */
1555   if (info->shifttype==MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1556     sctx.shift_top = 0;
1557     for (i=0; i<n; i++) {
1558       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1559       rs    = 0.0;
1560       ajtmp = aj + ai[i];
1561       rtmp1 = aa + ai[i];
1562       nz = ai[i+1] - ai[i];
1563       for (j=0; j<nz; j++){
1564         if (*ajtmp != i){
1565           rs += PetscAbsScalar(*rtmp1++);
1566         } else {
1567           rs -= PetscRealPart(*rtmp1++);
1568         }
1569         ajtmp++;
1570       }
1571       if (rs>sctx.shift_top) sctx.shift_top = rs;
1572     }
1573     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1574     sctx.shift_top *= 1.1;
1575     sctx.nshift_max = 5;
1576     sctx.shift_lo   = 0.;
1577     sctx.shift_hi   = 1.;
1578   }
1579   sctx.shift_amount = 0;
1580   sctx.nshift       = 0;
1581 
1582   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1583   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1584   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1585   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1586   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1587   ics   = ic ;
1588   rtmp22 = rtmp11 + n;
1589   rtmp33 = rtmp22 + n;
1590 
1591   node_max = a->inode.node_count;
1592   ns       = a->inode.size;
1593   if (!ns){
1594     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1595   }
1596 
1597   /* If max inode size > 3, split it into two inodes.*/
1598   /* also map the inode sizes according to the ordering */
1599   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1600   for (i=0,j=0; i<node_max; ++i,++j){
1601     if (ns[i]>3) {
1602       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1603       ++j;
1604       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1605     } else {
1606       tmp_vec1[j] = ns[i];
1607     }
1608   }
1609   /* Use the correct node_max */
1610   node_max = j;
1611 
1612   /* Now reorder the inode info based on mat re-ordering info */
1613   /* First create a row -> inode_size_array_index map */
1614   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1615   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1616   for (i=0,row=0; i<node_max; i++) {
1617     nodesz = tmp_vec1[i];
1618     for (j=0; j<nodesz; j++,row++) {
1619       nsmap[row] = i;
1620     }
1621   }
1622   /* Using nsmap, create a reordered ns structure */
1623   for (i=0,j=0; i< node_max; i++) {
1624     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1625     tmp_vec2[i]  = nodesz;
1626     j           += nodesz;
1627   }
1628   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1629   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1630   /* Now use the correct ns */
1631   ns = tmp_vec2;
1632 
1633   do {
1634     sctx.useshift = PETSC_FALSE;
1635     /* Now loop over each block-row, and do the factorization */
1636     for (i=0,row=0; i<node_max; i++) {
1637       nodesz = ns[i];
1638       nz     = bi[row+1] - bi[row];
1639       bjtmp  = bj + bi[row];
1640 
1641       switch (nodesz){
1642       case 1:
1643         for  (j=0; j<nz; j++){
1644           idx        = bjtmp[j];
1645           rtmp11[idx] = 0.0;
1646         }
1647 
1648         /* load in initial (unfactored row) */
1649         idx    = r[row];
1650         nz_tmp = ai[idx+1] - ai[idx];
1651         ajtmp  = aj + ai[idx];
1652         v1     = aa + ai[idx];
1653 
1654         for (j=0; j<nz_tmp; j++) {
1655           idx        = ics[ajtmp[j]];
1656           rtmp11[idx] = v1[j];
1657         }
1658         rtmp11[ics[r[row]]] += sctx.shift_amount;
1659 
1660         prow = *bjtmp++ ;
1661         while (prow < row) {
1662           pc1 = rtmp11 + prow;
1663           if (*pc1 != 0.0){
1664             pv   = ba + bd[prow];
1665             pj   = nbj + bd[prow];
1666             mul1 = *pc1 * *pv++;
1667             *pc1 = mul1;
1668             nz_tmp = bi[prow+1] - bd[prow] - 1;
1669             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1670             for (j=0; j<nz_tmp; j++) {
1671               tmp = pv[j];
1672               idx = pj[j];
1673               rtmp11[idx] -= mul1 * tmp;
1674             }
1675           }
1676           prow = *bjtmp++ ;
1677         }
1678         pj  = bj + bi[row];
1679         pc1 = ba + bi[row];
1680 
1681         sctx.pv    = rtmp11[row];
1682         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1683         rs         = 0.0;
1684         for (j=0; j<nz; j++) {
1685           idx    = pj[j];
1686           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1687           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1688         }
1689         sctx.rs  = rs;
1690         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1691         if (newshift == 1) goto endofwhile;
1692         break;
1693 
1694       case 2:
1695         for (j=0; j<nz; j++) {
1696           idx        = bjtmp[j];
1697           rtmp11[idx] = 0.0;
1698           rtmp22[idx] = 0.0;
1699         }
1700 
1701         /* load in initial (unfactored row) */
1702         idx    = r[row];
1703         nz_tmp = ai[idx+1] - ai[idx];
1704         ajtmp  = aj + ai[idx];
1705         v1     = aa + ai[idx];
1706         v2     = aa + ai[idx+1];
1707         for (j=0; j<nz_tmp; j++) {
1708           idx        = ics[ajtmp[j]];
1709           rtmp11[idx] = v1[j];
1710           rtmp22[idx] = v2[j];
1711         }
1712         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1713         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1714 
1715         prow = *bjtmp++ ;
1716         while (prow < row) {
1717           pc1 = rtmp11 + prow;
1718           pc2 = rtmp22 + prow;
1719           if (*pc1 != 0.0 || *pc2 != 0.0){
1720             pv   = ba + bd[prow];
1721             pj   = nbj + bd[prow];
1722             mul1 = *pc1 * *pv;
1723             mul2 = *pc2 * *pv;
1724             ++pv;
1725             *pc1 = mul1;
1726             *pc2 = mul2;
1727 
1728             nz_tmp = bi[prow+1] - bd[prow] - 1;
1729             for (j=0; j<nz_tmp; j++) {
1730               tmp = pv[j];
1731               idx = pj[j];
1732               rtmp11[idx] -= mul1 * tmp;
1733               rtmp22[idx] -= mul2 * tmp;
1734             }
1735             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1736           }
1737           prow = *bjtmp++ ;
1738         }
1739 
1740         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1741         pc1 = rtmp11 + prow;
1742         pc2 = rtmp22 + prow;
1743 
1744         sctx.pv = *pc1;
1745         pj      = bj + bi[prow];
1746         rs      = 0.0;
1747         for (j=0; j<nz; j++){
1748           idx = pj[j];
1749           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1750         }
1751         sctx.rs = rs;
1752         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1753         if (newshift == 1) goto endofwhile;
1754 
1755         if (*pc2 != 0.0){
1756           pj     = nbj + bd[prow];
1757           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1758           *pc2   = mul2;
1759           nz_tmp = bi[prow+1] - bd[prow] - 1;
1760           for (j=0; j<nz_tmp; j++) {
1761             idx = pj[j] ;
1762             tmp = rtmp11[idx];
1763             rtmp22[idx] -= mul2 * tmp;
1764           }
1765           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1766         }
1767 
1768         pj  = bj + bi[row];
1769         pc1 = ba + bi[row];
1770         pc2 = ba + bi[row+1];
1771 
1772         sctx.pv = rtmp22[row+1];
1773         rs = 0.0;
1774         rtmp11[row]   = 1.0/rtmp11[row];
1775         rtmp22[row+1] = 1.0/rtmp22[row+1];
1776         /* copy row entries from dense representation to sparse */
1777         for (j=0; j<nz; j++) {
1778           idx    = pj[j];
1779           pc1[j] = rtmp11[idx];
1780           pc2[j] = rtmp22[idx];
1781           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1782         }
1783         sctx.rs = rs;
1784         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1785         if (newshift == 1) goto endofwhile;
1786         break;
1787 
1788       case 3:
1789         for  (j=0; j<nz; j++) {
1790           idx        = bjtmp[j];
1791           rtmp11[idx] = 0.0;
1792           rtmp22[idx] = 0.0;
1793           rtmp33[idx] = 0.0;
1794         }
1795         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1796         idx    = r[row];
1797         nz_tmp = ai[idx+1] - ai[idx];
1798         ajtmp = aj + ai[idx];
1799         v1    = aa + ai[idx];
1800         v2    = aa + ai[idx+1];
1801         v3    = aa + ai[idx+2];
1802         for (j=0; j<nz_tmp; j++) {
1803           idx        = ics[ajtmp[j]];
1804           rtmp11[idx] = v1[j];
1805           rtmp22[idx] = v2[j];
1806           rtmp33[idx] = v3[j];
1807         }
1808         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1809         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1810         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1811 
1812         /* loop over all pivot row blocks above this row block */
1813         prow = *bjtmp++ ;
1814         while (prow < row) {
1815           pc1 = rtmp11 + prow;
1816           pc2 = rtmp22 + prow;
1817           pc3 = rtmp33 + prow;
1818           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1819             pv   = ba  + bd[prow];
1820             pj   = nbj + bd[prow];
1821             mul1 = *pc1 * *pv;
1822             mul2 = *pc2 * *pv;
1823             mul3 = *pc3 * *pv;
1824             ++pv;
1825             *pc1 = mul1;
1826             *pc2 = mul2;
1827             *pc3 = mul3;
1828 
1829             nz_tmp = bi[prow+1] - bd[prow] - 1;
1830             /* update this row based on pivot row */
1831             for (j=0; j<nz_tmp; j++) {
1832               tmp = pv[j];
1833               idx = pj[j];
1834               rtmp11[idx] -= mul1 * tmp;
1835               rtmp22[idx] -= mul2 * tmp;
1836               rtmp33[idx] -= mul3 * tmp;
1837             }
1838             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1839           }
1840           prow = *bjtmp++ ;
1841         }
1842 
1843         /* Now take care of diagonal 3x3 block in this set of rows */
1844         /* note: prow = row here */
1845         pc1 = rtmp11 + prow;
1846         pc2 = rtmp22 + prow;
1847         pc3 = rtmp33 + prow;
1848 
1849         sctx.pv = *pc1;
1850         pj      = bj + bi[prow];
1851         rs      = 0.0;
1852         for (j=0; j<nz; j++){
1853           idx = pj[j];
1854           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
1855         }
1856         sctx.rs = rs;
1857         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1858         if (newshift == 1) goto endofwhile;
1859 
1860         if (*pc2 != 0.0 || *pc3 != 0.0){
1861           mul2 = (*pc2)/(*pc1);
1862           mul3 = (*pc3)/(*pc1);
1863           *pc2 = mul2;
1864           *pc3 = mul3;
1865           nz_tmp = bi[prow+1] - bd[prow] - 1;
1866           pj     = nbj + bd[prow];
1867           for (j=0; j<nz_tmp; j++) {
1868             idx = pj[j] ;
1869             tmp = rtmp11[idx];
1870             rtmp22[idx] -= mul2 * tmp;
1871             rtmp33[idx] -= mul3 * tmp;
1872           }
1873           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1874         }
1875         ++prow;
1876 
1877         pc2 = rtmp22 + prow;
1878         pc3 = rtmp33 + prow;
1879         sctx.pv = *pc2;
1880         pj      = bj + bi[prow];
1881         rs      = 0.0;
1882         for (j=0; j<nz; j++){
1883           idx = pj[j];
1884           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
1885         }
1886         sctx.rs = rs;
1887         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1888         if (newshift == 1) goto endofwhile;
1889 
1890         if (*pc3 != 0.0){
1891           mul3   = (*pc3)/(*pc2);
1892           *pc3   = mul3;
1893           pj     = nbj + bd[prow];
1894           nz_tmp = bi[prow+1] - bd[prow] - 1;
1895           for (j=0; j<nz_tmp; j++) {
1896             idx = pj[j] ;
1897             tmp = rtmp22[idx];
1898             rtmp33[idx] -= mul3 * tmp;
1899           }
1900           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1901         }
1902 
1903         pj  = bj + bi[row];
1904         pc1 = ba + bi[row];
1905         pc2 = ba + bi[row+1];
1906         pc3 = ba + bi[row+2];
1907 
1908         sctx.pv = rtmp33[row+2];
1909         rs = 0.0;
1910         rtmp11[row]   = 1.0/rtmp11[row];
1911         rtmp22[row+1] = 1.0/rtmp22[row+1];
1912         rtmp33[row+2] = 1.0/rtmp33[row+2];
1913         /* copy row entries from dense representation to sparse */
1914         for (j=0; j<nz; j++) {
1915           idx    = pj[j];
1916           pc1[j] = rtmp11[idx];
1917           pc2[j] = rtmp22[idx];
1918           pc3[j] = rtmp33[idx];
1919           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1920         }
1921 
1922         sctx.rs = rs;
1923         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
1924         if (newshift == 1) goto endofwhile;
1925         break;
1926 
1927       default:
1928         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
1929       }
1930       row += nodesz;                 /* Update the row */
1931     }
1932     endofwhile:;
1933   } while (sctx.useshift);
1934   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
1935   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1936   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1937   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1938   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1939   (B)->ops->solve           = MatSolve_SeqAIJ_Inode_inplace;
1940   /* do not set solve add, since MatSolve_Inode + Add is faster */
1941   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
1942   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
1943   C->assembled   = PETSC_TRUE;
1944   C->preallocated = PETSC_TRUE;
1945   if (sctx.nshift) {
1946     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) {
1947       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1948     } else if (info->shifttype == MAT_SHIFT_NONZERO) {
1949       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1950     }
1951   }
1952   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1953   PetscFunctionReturn(0);
1954 }
1955 
1956 
1957 /* ----------------------------------------------------------- */
1958 #undef __FUNCT__
1959 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
1960 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
1961 {
1962   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
1963   IS                iscol = a->col,isrow = a->row;
1964   PetscErrorCode    ierr;
1965   const PetscInt    *r,*c,*rout,*cout;
1966   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
1967   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
1968   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
1969   PetscScalar       sum1,sum2,sum3,sum4,sum5;
1970   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
1971   const PetscScalar *b;
1972 
1973   PetscFunctionBegin;
1974   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
1975   node_max = a->inode.node_count;
1976   ns       = a->inode.size;     /* Node Size array */
1977 
1978   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1979   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1980   tmp  = a->solve_work;
1981 
1982   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1983   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1984 
1985   /* forward solve the lower triangular */
1986   tmps = tmp ;
1987   aa   = a_a ;
1988   aj   = a_j ;
1989   ad   = a->diag;
1990 
1991   for (i = 0,row = 0; i< node_max; ++i){
1992     nsz = ns[i];
1993     aii = ai[row];
1994     v1  = aa + aii;
1995     vi  = aj + aii;
1996     nz  = ai[row+1]- ai[row];
1997 
1998     if (i < node_max-1) {
1999       /* Prefetch the indices for the next block */
2000       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2001       /* Prefetch the data for the next block */
2002       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2003     }
2004 
2005     switch (nsz){               /* Each loop in 'case' is unrolled */
2006     case 1 :
2007       sum1 = b[r[row]];
2008       for(j=0; j<nz-1; j+=2){
2009         i0   = vi[j];
2010         i1   = vi[j+1];
2011         tmp0 = tmps[i0];
2012         tmp1 = tmps[i1];
2013         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2014       }
2015       if(j == nz-1){
2016         tmp0 = tmps[vi[j]];
2017         sum1 -= v1[j]*tmp0;
2018       }
2019       tmp[row++]=sum1;
2020       break;
2021     case 2:
2022       sum1 = b[r[row]];
2023       sum2 = b[r[row+1]];
2024       v2   = aa + ai[row+1];
2025 
2026       for(j=0; j<nz-1; j+=2){
2027         i0   = vi[j];
2028         i1   = vi[j+1];
2029         tmp0 = tmps[i0];
2030         tmp1 = tmps[i1];
2031         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2032         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2033       }
2034       if(j == nz-1){
2035         tmp0 = tmps[vi[j]];
2036         sum1 -= v1[j] *tmp0;
2037         sum2 -= v2[j] *tmp0;
2038       }
2039       sum2 -= v2[nz] * sum1;
2040       tmp[row ++]=sum1;
2041       tmp[row ++]=sum2;
2042       break;
2043     case 3:
2044       sum1 = b[r[row]];
2045       sum2 = b[r[row+1]];
2046       sum3 = b[r[row+2]];
2047       v2   = aa + ai[row+1];
2048       v3   = aa + ai[row+2];
2049 
2050       for (j=0; j<nz-1; j+=2){
2051         i0   = vi[j];
2052         i1   = vi[j+1];
2053         tmp0 = tmps[i0];
2054         tmp1 = tmps[i1];
2055         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2056         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2057         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2058       }
2059       if (j == nz-1){
2060         tmp0 = tmps[vi[j]];
2061         sum1 -= v1[j] *tmp0;
2062         sum2 -= v2[j] *tmp0;
2063         sum3 -= v3[j] *tmp0;
2064       }
2065       sum2 -= v2[nz] * sum1;
2066       sum3 -= v3[nz] * sum1;
2067       sum3 -= v3[nz+1] * sum2;
2068       tmp[row ++]=sum1;
2069       tmp[row ++]=sum2;
2070       tmp[row ++]=sum3;
2071       break;
2072 
2073     case 4:
2074       sum1 = b[r[row]];
2075       sum2 = b[r[row+1]];
2076       sum3 = b[r[row+2]];
2077       sum4 = b[r[row+3]];
2078       v2   = aa + ai[row+1];
2079       v3   = aa + ai[row+2];
2080       v4   = aa + ai[row+3];
2081 
2082       for (j=0; j<nz-1; j+=2){
2083         i0   = vi[j];
2084         i1   = vi[j+1];
2085         tmp0 = tmps[i0];
2086         tmp1 = tmps[i1];
2087         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2088         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2089         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2090         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2091       }
2092       if (j == nz-1){
2093         tmp0 = tmps[vi[j]];
2094         sum1 -= v1[j] *tmp0;
2095         sum2 -= v2[j] *tmp0;
2096         sum3 -= v3[j] *tmp0;
2097         sum4 -= v4[j] *tmp0;
2098       }
2099       sum2 -= v2[nz] * sum1;
2100       sum3 -= v3[nz] * sum1;
2101       sum4 -= v4[nz] * sum1;
2102       sum3 -= v3[nz+1] * sum2;
2103       sum4 -= v4[nz+1] * sum2;
2104       sum4 -= v4[nz+2] * sum3;
2105 
2106       tmp[row ++]=sum1;
2107       tmp[row ++]=sum2;
2108       tmp[row ++]=sum3;
2109       tmp[row ++]=sum4;
2110       break;
2111     case 5:
2112       sum1 = b[r[row]];
2113       sum2 = b[r[row+1]];
2114       sum3 = b[r[row+2]];
2115       sum4 = b[r[row+3]];
2116       sum5 = b[r[row+4]];
2117       v2   = aa + ai[row+1];
2118       v3   = aa + ai[row+2];
2119       v4   = aa + ai[row+3];
2120       v5   = aa + ai[row+4];
2121 
2122       for (j=0; j<nz-1; j+=2){
2123         i0   = vi[j];
2124         i1   = vi[j+1];
2125         tmp0 = tmps[i0];
2126         tmp1 = tmps[i1];
2127         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2128         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2129         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2130         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2131         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2132       }
2133       if (j == nz-1){
2134         tmp0 = tmps[vi[j]];
2135         sum1 -= v1[j] *tmp0;
2136         sum2 -= v2[j] *tmp0;
2137         sum3 -= v3[j] *tmp0;
2138         sum4 -= v4[j] *tmp0;
2139         sum5 -= v5[j] *tmp0;
2140       }
2141 
2142       sum2 -= v2[nz] * sum1;
2143       sum3 -= v3[nz] * sum1;
2144       sum4 -= v4[nz] * sum1;
2145       sum5 -= v5[nz] * sum1;
2146       sum3 -= v3[nz+1] * sum2;
2147       sum4 -= v4[nz+1] * sum2;
2148       sum5 -= v5[nz+1] * sum2;
2149       sum4 -= v4[nz+2] * sum3;
2150       sum5 -= v5[nz+2] * sum3;
2151       sum5 -= v5[nz+3] * sum4;
2152 
2153       tmp[row ++]=sum1;
2154       tmp[row ++]=sum2;
2155       tmp[row ++]=sum3;
2156       tmp[row ++]=sum4;
2157       tmp[row ++]=sum5;
2158       break;
2159     default:
2160       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2161     }
2162   }
2163   /* backward solve the upper triangular */
2164   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2165     nsz = ns[i];
2166     aii = ad[row+1] + 1;
2167     v1  = aa + aii;
2168     vi  = aj + aii;
2169     nz  = ad[row]- ad[row+1] - 1;
2170 
2171     if (i > 0) {
2172       /* Prefetch the indices for the next block */
2173       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2174       /* Prefetch the data for the next block */
2175       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2176     }
2177 
2178     switch (nsz){               /* Each loop in 'case' is unrolled */
2179     case 1 :
2180       sum1 = tmp[row];
2181 
2182       for(j=0 ; j<nz-1; j+=2){
2183         i0   = vi[j];
2184         i1   = vi[j+1];
2185         tmp0 = tmps[i0];
2186         tmp1 = tmps[i1];
2187         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2188       }
2189       if (j == nz-1){
2190         tmp0  = tmps[vi[j]];
2191         sum1 -= v1[j]*tmp0;
2192       }
2193       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2194       break;
2195     case 2 :
2196       sum1 = tmp[row];
2197       sum2 = tmp[row-1];
2198       v2   = aa + ad[row] + 1;
2199       for (j=0 ; j<nz-1; j+=2){
2200         i0   = vi[j];
2201         i1   = vi[j+1];
2202         tmp0 = tmps[i0];
2203         tmp1 = tmps[i1];
2204         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2205         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2206       }
2207       if (j == nz-1){
2208         tmp0  = tmps[vi[j]];
2209         sum1 -= v1[j]* tmp0;
2210         sum2 -= v2[j+1]* tmp0;
2211       }
2212 
2213       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2214       sum2   -= v2[0] * tmp0;
2215       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2216       break;
2217     case 3 :
2218       sum1 = tmp[row];
2219       sum2 = tmp[row -1];
2220       sum3 = tmp[row -2];
2221       v2   = aa + ad[row] + 1;
2222       v3   = aa + ad[row -1] + 1;
2223       for (j=0 ; j<nz-1; j+=2){
2224         i0   = vi[j];
2225         i1   = vi[j+1];
2226         tmp0 = tmps[i0];
2227         tmp1 = tmps[i1];
2228         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2229         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2230         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2231       }
2232       if (j== nz-1){
2233         tmp0  = tmps[vi[j]];
2234         sum1 -= v1[j] * tmp0;
2235         sum2 -= v2[j+1] * tmp0;
2236         sum3 -= v3[j+2] * tmp0;
2237       }
2238       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2239       sum2   -= v2[0]* tmp0;
2240       sum3   -= v3[1] * tmp0;
2241       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2242       sum3   -= v3[0]* tmp0;
2243       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2244 
2245       break;
2246     case 4 :
2247       sum1 = tmp[row];
2248       sum2 = tmp[row -1];
2249       sum3 = tmp[row -2];
2250       sum4 = tmp[row -3];
2251       v2   = aa + ad[row]+1;
2252       v3   = aa + ad[row -1]+1;
2253       v4   = aa + ad[row -2]+1;
2254 
2255       for (j=0 ; j<nz-1; j+=2){
2256         i0   = vi[j];
2257         i1   = vi[j+1];
2258         tmp0 = tmps[i0];
2259         tmp1 = tmps[i1];
2260         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2261         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2262         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2263         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2264       }
2265       if (j== nz-1){
2266         tmp0  = tmps[vi[j]];
2267         sum1 -= v1[j] * tmp0;
2268         sum2 -= v2[j+1] * tmp0;
2269         sum3 -= v3[j+2] * tmp0;
2270         sum4 -= v4[j+3] * tmp0;
2271       }
2272 
2273       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2274       sum2   -= v2[0] * tmp0;
2275       sum3   -= v3[1] * tmp0;
2276       sum4   -= v4[2] * tmp0;
2277       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2278       sum3   -= v3[0] * tmp0;
2279       sum4   -= v4[1] * tmp0;
2280       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2281       sum4   -= v4[0] * tmp0;
2282       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2283       break;
2284     case 5 :
2285       sum1 = tmp[row];
2286       sum2 = tmp[row -1];
2287       sum3 = tmp[row -2];
2288       sum4 = tmp[row -3];
2289       sum5 = tmp[row -4];
2290       v2   = aa + ad[row]+1;
2291       v3   = aa + ad[row -1]+1;
2292       v4   = aa + ad[row -2]+1;
2293       v5   = aa + ad[row -3]+1;
2294       for (j=0 ; j<nz-1; j+=2){
2295         i0   = vi[j];
2296         i1   = vi[j+1];
2297         tmp0 = tmps[i0];
2298         tmp1 = tmps[i1];
2299         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2300         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2301         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2302         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2303         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2304       }
2305       if (j==nz-1){
2306         tmp0  = tmps[vi[j]];
2307         sum1 -= v1[j] * tmp0;
2308         sum2 -= v2[j+1] * tmp0;
2309         sum3 -= v3[j+2] * tmp0;
2310         sum4 -= v4[j+3] * tmp0;
2311         sum5 -= v5[j+4] * tmp0;
2312       }
2313 
2314       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2315       sum2   -= v2[0] * tmp0;
2316       sum3   -= v3[1] * tmp0;
2317       sum4   -= v4[2] * tmp0;
2318       sum5   -= v5[3] * tmp0;
2319       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2320       sum3   -= v3[0] * tmp0;
2321       sum4   -= v4[1] * tmp0;
2322       sum5   -= v5[2] * tmp0;
2323       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2324       sum4   -= v4[0] * tmp0;
2325       sum5   -= v5[1] * tmp0;
2326       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2327       sum5   -= v5[0] * tmp0;
2328       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2329       break;
2330     default:
2331       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2332     }
2333   }
2334   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2335   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2336   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2337   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2338   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2339   PetscFunctionReturn(0);
2340 }
2341 
2342 
2343 /*
2344      Makes a longer coloring[] array and calls the usual code with that
2345 */
2346 #undef __FUNCT__
2347 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2348 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2349 {
2350   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2351   PetscErrorCode  ierr;
2352   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2353   PetscInt        *colorused,i;
2354   ISColoringValue *newcolor;
2355 
2356   PetscFunctionBegin;
2357   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2358   /* loop over inodes, marking a color for each column*/
2359   row = 0;
2360   for (i=0; i<m; i++){
2361     for (j=0; j<ns[i]; j++) {
2362       newcolor[row++] = coloring[i] + j*ncolors;
2363     }
2364   }
2365 
2366   /* eliminate unneeded colors */
2367   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2368   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2369   for (i=0; i<n; i++) {
2370     colorused[newcolor[i]] = 1;
2371   }
2372 
2373   for (i=1; i<5*ncolors; i++) {
2374     colorused[i] += colorused[i-1];
2375   }
2376   ncolors = colorused[5*ncolors-1];
2377   for (i=0; i<n; i++) {
2378     newcolor[i] = colorused[newcolor[i]]-1;
2379   }
2380   ierr = PetscFree(colorused);CHKERRQ(ierr);
2381   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2382   ierr = PetscFree(coloring);CHKERRQ(ierr);
2383   PetscFunctionReturn(0);
2384 }
2385 
2386 #include "../src/mat/blockinvert.h"
2387 
2388 #undef __FUNCT__
2389 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2390 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2391 {
2392   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2393   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2394   MatScalar          *ibdiag,*bdiag,work[25];
2395   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2396   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2397   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2398   PetscErrorCode     ierr;
2399   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2400   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2401 
2402   PetscFunctionBegin;
2403   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2404   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2405   if (its > 1) {
2406     /* switch to non-inode version */
2407     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2408     PetscFunctionReturn(0);
2409   }
2410 
2411   if (!a->inode.ibdiagvalid) {
2412     if (!a->inode.ibdiag) {
2413       /* calculate space needed for diagonal blocks */
2414       for (i=0; i<m; i++) {
2415 	cnt += sizes[i]*sizes[i];
2416       }
2417       a->inode.bdiagsize = cnt;
2418       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2419     }
2420 
2421     /* copy over the diagonal blocks and invert them */
2422     ibdiag = a->inode.ibdiag;
2423     bdiag  = a->inode.bdiag;
2424     cnt = 0;
2425     for (i=0, row = 0; i<m; i++) {
2426       for (j=0; j<sizes[i]; j++) {
2427         for (k=0; k<sizes[i]; k++) {
2428           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2429         }
2430       }
2431       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2432 
2433       switch(sizes[i]) {
2434         case 1:
2435           /* Create matrix data structure */
2436           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2437           ibdiag[cnt] = 1.0/ibdiag[cnt];
2438           break;
2439         case 2:
2440           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2441           break;
2442         case 3:
2443           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2444           break;
2445         case 4:
2446           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2447           break;
2448         case 5:
2449           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2450           break;
2451        default:
2452 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2453       }
2454       cnt += sizes[i]*sizes[i];
2455       row += sizes[i];
2456     }
2457     a->inode.ibdiagvalid = PETSC_TRUE;
2458   }
2459   ibdiag = a->inode.ibdiag;
2460   bdiag  = a->inode.bdiag;
2461 
2462 
2463   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2464   if (flag & SOR_ZERO_INITIAL_GUESS) {
2465     PetscScalar *b;
2466     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2467     if (xx != bb) {
2468       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2469     } else {
2470       b = x;
2471     }
2472     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2473 
2474       for (i=0, row=0; i<m; i++) {
2475         sz  = diag[row] - ii[row];
2476         v1  = a->a + ii[row];
2477         idx = a->j + ii[row];
2478 
2479         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2480         switch (sizes[i]){
2481           case 1:
2482 
2483             sum1  = b[row];
2484             for(n = 0; n<sz-1; n+=2) {
2485               i1   = idx[0];
2486               i2   = idx[1];
2487               idx += 2;
2488               tmp0 = x[i1];
2489               tmp1 = x[i2];
2490               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2491             }
2492 
2493             if (n == sz-1){
2494               tmp0  = x[*idx];
2495               sum1 -= *v1 * tmp0;
2496             }
2497             x[row++] = sum1*(*ibdiag++);
2498             break;
2499           case 2:
2500             v2    = a->a + ii[row+1];
2501             sum1  = b[row];
2502             sum2  = b[row+1];
2503             for(n = 0; n<sz-1; n+=2) {
2504               i1   = idx[0];
2505               i2   = idx[1];
2506               idx += 2;
2507               tmp0 = x[i1];
2508               tmp1 = x[i2];
2509               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2510               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2511             }
2512 
2513             if (n == sz-1){
2514               tmp0  = x[*idx];
2515               sum1 -= v1[0] * tmp0;
2516               sum2 -= v2[0] * tmp0;
2517             }
2518             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2519             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2520             ibdiag  += 4;
2521             break;
2522           case 3:
2523             v2    = a->a + ii[row+1];
2524             v3    = a->a + ii[row+2];
2525             sum1  = b[row];
2526             sum2  = b[row+1];
2527             sum3  = b[row+2];
2528             for(n = 0; n<sz-1; n+=2) {
2529               i1   = idx[0];
2530               i2   = idx[1];
2531               idx += 2;
2532               tmp0 = x[i1];
2533               tmp1 = x[i2];
2534               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2535               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2536               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2537             }
2538 
2539             if (n == sz-1){
2540               tmp0  = x[*idx];
2541               sum1 -= v1[0] * tmp0;
2542               sum2 -= v2[0] * tmp0;
2543               sum3 -= v3[0] * tmp0;
2544             }
2545             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2546             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2547             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2548             ibdiag  += 9;
2549             break;
2550           case 4:
2551             v2    = a->a + ii[row+1];
2552             v3    = a->a + ii[row+2];
2553             v4    = a->a + ii[row+3];
2554             sum1  = b[row];
2555             sum2  = b[row+1];
2556             sum3  = b[row+2];
2557             sum4  = b[row+3];
2558             for(n = 0; n<sz-1; n+=2) {
2559               i1   = idx[0];
2560               i2   = idx[1];
2561               idx += 2;
2562               tmp0 = x[i1];
2563               tmp1 = x[i2];
2564               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2565               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2566               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2567               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2568             }
2569 
2570             if (n == sz-1){
2571               tmp0  = x[*idx];
2572               sum1 -= v1[0] * tmp0;
2573               sum2 -= v2[0] * tmp0;
2574               sum3 -= v3[0] * tmp0;
2575               sum4 -= v4[0] * tmp0;
2576             }
2577             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2578             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2579             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2580             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2581             ibdiag  += 16;
2582             break;
2583           case 5:
2584             v2    = a->a + ii[row+1];
2585             v3    = a->a + ii[row+2];
2586             v4    = a->a + ii[row+3];
2587             v5    = a->a + ii[row+4];
2588             sum1  = b[row];
2589             sum2  = b[row+1];
2590             sum3  = b[row+2];
2591             sum4  = b[row+3];
2592             sum5  = b[row+4];
2593             for(n = 0; n<sz-1; n+=2) {
2594               i1   = idx[0];
2595               i2   = idx[1];
2596               idx += 2;
2597               tmp0 = x[i1];
2598               tmp1 = x[i2];
2599               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2600               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2601               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2602               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2603               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2604             }
2605 
2606             if (n == sz-1){
2607               tmp0  = x[*idx];
2608               sum1 -= v1[0] * tmp0;
2609               sum2 -= v2[0] * tmp0;
2610               sum3 -= v3[0] * tmp0;
2611               sum4 -= v4[0] * tmp0;
2612               sum5 -= v5[0] * tmp0;
2613             }
2614             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2615             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2616             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2617             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2618             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2619             ibdiag  += 25;
2620             break;
2621 	  default:
2622    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2623         }
2624       }
2625 
2626       xb = x;
2627       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2628     } else xb = b;
2629     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2630         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
2631       cnt = 0;
2632       for (i=0, row=0; i<m; i++) {
2633 
2634         switch (sizes[i]){
2635           case 1:
2636             x[row++] *= bdiag[cnt++];
2637             break;
2638           case 2:
2639             x1   = x[row]; x2 = x[row+1];
2640             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2641             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2642             x[row++] = tmp1;
2643             x[row++] = tmp2;
2644             cnt += 4;
2645             break;
2646           case 3:
2647             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2648             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2649             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2650             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2651             x[row++] = tmp1;
2652             x[row++] = tmp2;
2653             x[row++] = tmp3;
2654             cnt += 9;
2655             break;
2656           case 4:
2657             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2658             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2659             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2660             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2661             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2662             x[row++] = tmp1;
2663             x[row++] = tmp2;
2664             x[row++] = tmp3;
2665             x[row++] = tmp4;
2666             cnt += 16;
2667             break;
2668           case 5:
2669             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2670             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2671             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2672             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2673             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2674             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2675             x[row++] = tmp1;
2676             x[row++] = tmp2;
2677             x[row++] = tmp3;
2678             x[row++] = tmp4;
2679             x[row++] = tmp5;
2680             cnt += 25;
2681             break;
2682 	  default:
2683    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2684         }
2685       }
2686       ierr = PetscLogFlops(m);CHKERRQ(ierr);
2687     }
2688     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
2689 
2690       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2691       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2692         ibdiag -= sizes[i]*sizes[i];
2693         sz      = ii[row+1] - diag[row] - 1;
2694         v1      = a->a + diag[row] + 1;
2695         idx     = a->j + diag[row] + 1;
2696 
2697         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2698         switch (sizes[i]){
2699           case 1:
2700 
2701             sum1  = xb[row];
2702             for(n = 0; n<sz-1; n+=2) {
2703               i1   = idx[0];
2704               i2   = idx[1];
2705               idx += 2;
2706               tmp0 = x[i1];
2707               tmp1 = x[i2];
2708               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2709             }
2710 
2711             if (n == sz-1){
2712               tmp0  = x[*idx];
2713               sum1 -= *v1*tmp0;
2714             }
2715             x[row--] = sum1*(*ibdiag);
2716             break;
2717 
2718           case 2:
2719 
2720             sum1  = xb[row];
2721             sum2  = xb[row-1];
2722             /* note that sum1 is associated with the second of the two rows */
2723             v2    = a->a + diag[row-1] + 2;
2724             for(n = 0; n<sz-1; n+=2) {
2725               i1   = idx[0];
2726               i2   = idx[1];
2727               idx += 2;
2728               tmp0 = x[i1];
2729               tmp1 = x[i2];
2730               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2731               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2732             }
2733 
2734             if (n == sz-1){
2735               tmp0  = x[*idx];
2736               sum1 -= *v1*tmp0;
2737               sum2 -= *v2*tmp0;
2738             }
2739             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
2740             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
2741             break;
2742           case 3:
2743 
2744             sum1  = xb[row];
2745             sum2  = xb[row-1];
2746             sum3  = xb[row-2];
2747             v2    = a->a + diag[row-1] + 2;
2748             v3    = a->a + diag[row-2] + 3;
2749             for(n = 0; n<sz-1; n+=2) {
2750               i1   = idx[0];
2751               i2   = idx[1];
2752               idx += 2;
2753               tmp0 = x[i1];
2754               tmp1 = x[i2];
2755               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2756               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2757               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2758             }
2759 
2760             if (n == sz-1){
2761               tmp0  = x[*idx];
2762               sum1 -= *v1*tmp0;
2763               sum2 -= *v2*tmp0;
2764               sum3 -= *v3*tmp0;
2765             }
2766             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2767             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2768             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2769             break;
2770           case 4:
2771 
2772             sum1  = xb[row];
2773             sum2  = xb[row-1];
2774             sum3  = xb[row-2];
2775             sum4  = xb[row-3];
2776             v2    = a->a + diag[row-1] + 2;
2777             v3    = a->a + diag[row-2] + 3;
2778             v4    = a->a + diag[row-3] + 4;
2779             for(n = 0; n<sz-1; n+=2) {
2780               i1   = idx[0];
2781               i2   = idx[1];
2782               idx += 2;
2783               tmp0 = x[i1];
2784               tmp1 = x[i2];
2785               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2786               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2787               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2788               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2789             }
2790 
2791             if (n == sz-1){
2792               tmp0  = x[*idx];
2793               sum1 -= *v1*tmp0;
2794               sum2 -= *v2*tmp0;
2795               sum3 -= *v3*tmp0;
2796               sum4 -= *v4*tmp0;
2797             }
2798             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2799             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2800             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2801             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2802             break;
2803           case 5:
2804 
2805             sum1  = xb[row];
2806             sum2  = xb[row-1];
2807             sum3  = xb[row-2];
2808             sum4  = xb[row-3];
2809             sum5  = xb[row-4];
2810             v2    = a->a + diag[row-1] + 2;
2811             v3    = a->a + diag[row-2] + 3;
2812             v4    = a->a + diag[row-3] + 4;
2813             v5    = a->a + diag[row-4] + 5;
2814             for(n = 0; n<sz-1; n+=2) {
2815               i1   = idx[0];
2816               i2   = idx[1];
2817               idx += 2;
2818               tmp0 = x[i1];
2819               tmp1 = x[i2];
2820               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2821               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2822               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2823               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2824               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2825             }
2826 
2827             if (n == sz-1){
2828               tmp0  = x[*idx];
2829               sum1 -= *v1*tmp0;
2830               sum2 -= *v2*tmp0;
2831               sum3 -= *v3*tmp0;
2832               sum4 -= *v4*tmp0;
2833               sum5 -= *v5*tmp0;
2834             }
2835             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2836             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2837             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2838             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2839             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2840             break;
2841 	  default:
2842    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2843         }
2844       }
2845 
2846       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2847     }
2848     its--;
2849     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2850     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2851   }
2852   if (flag & SOR_EISENSTAT) {
2853     const PetscScalar *b;
2854     MatScalar         *t = a->inode.ssor_work;
2855 
2856     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2857     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2858     /*
2859           Apply  (U + D)^-1  where D is now the block diagonal
2860     */
2861     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2862     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2863       ibdiag -= sizes[i]*sizes[i];
2864       sz      = ii[row+1] - diag[row] - 1;
2865       v1      = a->a + diag[row] + 1;
2866       idx     = a->j + diag[row] + 1;
2867       CHKMEMQ;
2868       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2869       switch (sizes[i]){
2870         case 1:
2871 
2872 	  sum1  = b[row];
2873 	  for(n = 0; n<sz-1; n+=2) {
2874 	    i1   = idx[0];
2875 	    i2   = idx[1];
2876 	    idx += 2;
2877 	    tmp0 = x[i1];
2878 	    tmp1 = x[i2];
2879 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2880 	  }
2881 
2882 	  if (n == sz-1){
2883 	    tmp0  = x[*idx];
2884 	    sum1 -= *v1*tmp0;
2885 	  }
2886 	  x[row] = sum1*(*ibdiag);row--;
2887 	  break;
2888 
2889         case 2:
2890 
2891 	  sum1  = b[row];
2892 	  sum2  = b[row-1];
2893 	  /* note that sum1 is associated with the second of the two rows */
2894 	  v2    = a->a + diag[row-1] + 2;
2895 	  for(n = 0; n<sz-1; n+=2) {
2896 	    i1   = idx[0];
2897 	    i2   = idx[1];
2898 	    idx += 2;
2899 	    tmp0 = x[i1];
2900 	    tmp1 = x[i2];
2901 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2902 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2903 	  }
2904 
2905 	  if (n == sz-1){
2906 	    tmp0  = x[*idx];
2907 	    sum1 -= *v1*tmp0;
2908 	    sum2 -= *v2*tmp0;
2909 	  }
2910 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
2911 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
2912           row -= 2;
2913 	  break;
2914         case 3:
2915 
2916 	  sum1  = b[row];
2917 	  sum2  = b[row-1];
2918 	  sum3  = b[row-2];
2919 	  v2    = a->a + diag[row-1] + 2;
2920 	  v3    = a->a + diag[row-2] + 3;
2921 	  for(n = 0; n<sz-1; n+=2) {
2922 	    i1   = idx[0];
2923 	    i2   = idx[1];
2924 	    idx += 2;
2925 	    tmp0 = x[i1];
2926 	    tmp1 = x[i2];
2927 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2928 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2929 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2930 	  }
2931 
2932 	  if (n == sz-1){
2933 	    tmp0  = x[*idx];
2934 	    sum1 -= *v1*tmp0;
2935 	    sum2 -= *v2*tmp0;
2936 	    sum3 -= *v3*tmp0;
2937 	  }
2938 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2939 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2940 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2941           row -= 3;
2942 	  break;
2943         case 4:
2944 
2945 	  sum1  = b[row];
2946 	  sum2  = b[row-1];
2947 	  sum3  = b[row-2];
2948 	  sum4  = b[row-3];
2949 	  v2    = a->a + diag[row-1] + 2;
2950 	  v3    = a->a + diag[row-2] + 3;
2951 	  v4    = a->a + diag[row-3] + 4;
2952 	  for(n = 0; n<sz-1; n+=2) {
2953 	    i1   = idx[0];
2954 	    i2   = idx[1];
2955 	    idx += 2;
2956 	    tmp0 = x[i1];
2957 	    tmp1 = x[i2];
2958 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2959 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2960 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2961 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2962 	  }
2963 
2964 	  if (n == sz-1){
2965 	    tmp0  = x[*idx];
2966 	    sum1 -= *v1*tmp0;
2967 	    sum2 -= *v2*tmp0;
2968 	    sum3 -= *v3*tmp0;
2969 	    sum4 -= *v4*tmp0;
2970 	  }
2971 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2972 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2973 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2974 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2975           row -= 4;
2976 	  break;
2977         case 5:
2978 
2979 	  sum1  = b[row];
2980 	  sum2  = b[row-1];
2981 	  sum3  = b[row-2];
2982 	  sum4  = b[row-3];
2983 	  sum5  = b[row-4];
2984 	  v2    = a->a + diag[row-1] + 2;
2985 	  v3    = a->a + diag[row-2] + 3;
2986 	  v4    = a->a + diag[row-3] + 4;
2987 	  v5    = a->a + diag[row-4] + 5;
2988 	  for(n = 0; n<sz-1; n+=2) {
2989 	    i1   = idx[0];
2990 	    i2   = idx[1];
2991 	    idx += 2;
2992 	    tmp0 = x[i1];
2993 	    tmp1 = x[i2];
2994 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2995 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2996 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2997 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2998 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2999 	  }
3000 
3001 	  if (n == sz-1){
3002 	    tmp0  = x[*idx];
3003 	    sum1 -= *v1*tmp0;
3004 	    sum2 -= *v2*tmp0;
3005 	    sum3 -= *v3*tmp0;
3006 	    sum4 -= *v4*tmp0;
3007 	    sum5 -= *v5*tmp0;
3008 	  }
3009 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3010 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3011 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3012 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3013 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3014           row -= 5;
3015 	  break;
3016         default:
3017 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3018       }
3019       CHKMEMQ;
3020     }
3021     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3022 
3023     /*
3024            t = b - D x    where D is the block diagonal
3025     */
3026     cnt = 0;
3027     for (i=0, row=0; i<m; i++) {
3028       CHKMEMQ;
3029       switch (sizes[i]){
3030         case 1:
3031 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3032 	  break;
3033         case 2:
3034 	  x1   = x[row]; x2 = x[row+1];
3035 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3036 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3037 	  t[row]   = b[row] - tmp1;
3038 	  t[row+1] = b[row+1] - tmp2; row += 2;
3039 	  cnt += 4;
3040 	  break;
3041         case 3:
3042 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3043 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3044 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3045 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3046 	  t[row] = b[row] - tmp1;
3047 	  t[row+1] = b[row+1] - tmp2;
3048 	  t[row+2] = b[row+2] - tmp3; row += 3;
3049 	  cnt += 9;
3050 	  break;
3051         case 4:
3052 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3053 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3054 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3055 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3056 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3057 	  t[row] = b[row] - tmp1;
3058 	  t[row+1] = b[row+1] - tmp2;
3059 	  t[row+2] = b[row+2] - tmp3;
3060 	  t[row+3] = b[row+3] - tmp4; row += 4;
3061 	  cnt += 16;
3062 	  break;
3063         case 5:
3064 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3065 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3066 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3067 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3068 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3069 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3070 	  t[row] = b[row] - tmp1;
3071 	  t[row+1] = b[row+1] - tmp2;
3072 	  t[row+2] = b[row+2] - tmp3;
3073 	  t[row+3] = b[row+3] - tmp4;
3074 	  t[row+4] = b[row+4] - tmp5;row += 5;
3075 	  cnt += 25;
3076 	  break;
3077         default:
3078 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3079       }
3080       CHKMEMQ;
3081     }
3082     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3083 
3084 
3085 
3086     /*
3087           Apply (L + D)^-1 where D is the block diagonal
3088     */
3089     for (i=0, row=0; i<m; i++) {
3090       sz  = diag[row] - ii[row];
3091       v1  = a->a + ii[row];
3092       idx = a->j + ii[row];
3093       CHKMEMQ;
3094       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3095       switch (sizes[i]){
3096         case 1:
3097 
3098 	  sum1  = t[row];
3099 	  for(n = 0; n<sz-1; n+=2) {
3100 	    i1   = idx[0];
3101 	    i2   = idx[1];
3102 	    idx += 2;
3103 	    tmp0 = t[i1];
3104 	    tmp1 = t[i2];
3105 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3106 	  }
3107 
3108 	  if (n == sz-1){
3109 	    tmp0  = t[*idx];
3110 	    sum1 -= *v1 * tmp0;
3111 	  }
3112 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3113 	  break;
3114         case 2:
3115 	  v2    = a->a + ii[row+1];
3116 	  sum1  = t[row];
3117 	  sum2  = t[row+1];
3118 	  for(n = 0; n<sz-1; n+=2) {
3119 	    i1   = idx[0];
3120 	    i2   = idx[1];
3121 	    idx += 2;
3122 	    tmp0 = t[i1];
3123 	    tmp1 = t[i2];
3124 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3125 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3126 	  }
3127 
3128 	  if (n == sz-1){
3129 	    tmp0  = t[*idx];
3130 	    sum1 -= v1[0] * tmp0;
3131 	    sum2 -= v2[0] * tmp0;
3132 	  }
3133 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3134 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3135 	  ibdiag  += 4; row += 2;
3136 	  break;
3137         case 3:
3138 	  v2    = a->a + ii[row+1];
3139 	  v3    = a->a + ii[row+2];
3140 	  sum1  = t[row];
3141 	  sum2  = t[row+1];
3142 	  sum3  = t[row+2];
3143 	  for(n = 0; n<sz-1; n+=2) {
3144 	    i1   = idx[0];
3145 	    i2   = idx[1];
3146 	    idx += 2;
3147 	    tmp0 = t[i1];
3148 	    tmp1 = t[i2];
3149 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3150 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3151 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3152 	  }
3153 
3154 	  if (n == sz-1){
3155 	    tmp0  = t[*idx];
3156 	    sum1 -= v1[0] * tmp0;
3157 	    sum2 -= v2[0] * tmp0;
3158 	    sum3 -= v3[0] * tmp0;
3159 	  }
3160 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3161 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3162 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3163 	  ibdiag  += 9; row += 3;
3164 	  break;
3165         case 4:
3166 	  v2    = a->a + ii[row+1];
3167 	  v3    = a->a + ii[row+2];
3168 	  v4    = a->a + ii[row+3];
3169 	  sum1  = t[row];
3170 	  sum2  = t[row+1];
3171 	  sum3  = t[row+2];
3172 	  sum4  = t[row+3];
3173 	  for(n = 0; n<sz-1; n+=2) {
3174 	    i1   = idx[0];
3175 	    i2   = idx[1];
3176 	    idx += 2;
3177 	    tmp0 = t[i1];
3178 	    tmp1 = t[i2];
3179 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3180 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3181 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3182 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3183 	  }
3184 
3185 	  if (n == sz-1){
3186 	    tmp0  = t[*idx];
3187 	    sum1 -= v1[0] * tmp0;
3188 	    sum2 -= v2[0] * tmp0;
3189 	    sum3 -= v3[0] * tmp0;
3190 	    sum4 -= v4[0] * tmp0;
3191 	  }
3192 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3193 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3194 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3195 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3196 	  ibdiag  += 16; row += 4;
3197 	  break;
3198         case 5:
3199 	  v2    = a->a + ii[row+1];
3200 	  v3    = a->a + ii[row+2];
3201 	  v4    = a->a + ii[row+3];
3202 	  v5    = a->a + ii[row+4];
3203 	  sum1  = t[row];
3204 	  sum2  = t[row+1];
3205 	  sum3  = t[row+2];
3206 	  sum4  = t[row+3];
3207 	  sum5  = t[row+4];
3208 	  for(n = 0; n<sz-1; n+=2) {
3209 	    i1   = idx[0];
3210 	    i2   = idx[1];
3211 	    idx += 2;
3212 	    tmp0 = t[i1];
3213 	    tmp1 = t[i2];
3214 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3215 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3216 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3217 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3218 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3219 	  }
3220 
3221 	  if (n == sz-1){
3222 	    tmp0  = t[*idx];
3223 	    sum1 -= v1[0] * tmp0;
3224 	    sum2 -= v2[0] * tmp0;
3225 	    sum3 -= v3[0] * tmp0;
3226 	    sum4 -= v4[0] * tmp0;
3227 	    sum5 -= v5[0] * tmp0;
3228 	  }
3229 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3230 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3231 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3232 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3233 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3234 	  ibdiag  += 25; row += 5;
3235 	  break;
3236         default:
3237 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3238       }
3239       CHKMEMQ;
3240     }
3241     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3242     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3243     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3244   }
3245   PetscFunctionReturn(0);
3246 }
3247 
3248 #undef __FUNCT__
3249 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3250 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3251 {
3252   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3253   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3254   const MatScalar    *bdiag = a->inode.bdiag;
3255   const PetscScalar  *b;
3256   PetscErrorCode      ierr;
3257   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3258   const PetscInt      *sizes = a->inode.size;
3259 
3260   PetscFunctionBegin;
3261   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3262   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3263   cnt = 0;
3264   for (i=0, row=0; i<m; i++) {
3265     switch (sizes[i]){
3266       case 1:
3267 	x[row] = b[row]*bdiag[cnt++];row++;
3268 	break;
3269       case 2:
3270 	x1   = b[row]; x2 = b[row+1];
3271 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3272 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3273 	x[row++] = tmp1;
3274 	x[row++] = tmp2;
3275 	cnt += 4;
3276 	break;
3277       case 3:
3278 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3279 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3280 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3281 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3282 	x[row++] = tmp1;
3283 	x[row++] = tmp2;
3284 	x[row++] = tmp3;
3285 	cnt += 9;
3286 	break;
3287       case 4:
3288 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3289 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3290 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3291 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3292 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3293 	x[row++] = tmp1;
3294 	x[row++] = tmp2;
3295 	x[row++] = tmp3;
3296 	x[row++] = tmp4;
3297 	cnt += 16;
3298 	break;
3299       case 5:
3300 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3301 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3302 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3303 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3304 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3305 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3306 	x[row++] = tmp1;
3307 	x[row++] = tmp2;
3308 	x[row++] = tmp3;
3309 	x[row++] = tmp4;
3310 	x[row++] = tmp5;
3311 	cnt += 25;
3312 	break;
3313       default:
3314 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3315     }
3316   }
3317   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3318   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3319   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3320   PetscFunctionReturn(0);
3321 }
3322 
3323 /*
3324     samestructure indicates that the matrix has not changed its nonzero structure so we
3325     do not need to recompute the inodes
3326 */
3327 #undef __FUNCT__
3328 #define __FUNCT__ "Mat_CheckInode"
3329 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3330 {
3331   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3332   PetscErrorCode ierr;
3333   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3334   PetscTruth     flag;
3335 
3336   PetscFunctionBegin;
3337   if (!a->inode.use)                     PetscFunctionReturn(0);
3338   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3339 
3340 
3341   m = A->rmap->n;
3342   if (a->inode.size) {ns = a->inode.size;}
3343   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3344 
3345   i          = 0;
3346   node_count = 0;
3347   idx        = a->j;
3348   ii         = a->i;
3349   while (i < m){                /* For each row */
3350     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3351     /* Limits the number of elements in a node to 'a->inode.limit' */
3352     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3353       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3354       if (nzy != nzx) break;
3355       idy  += nzx;             /* Same nonzero pattern */
3356       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3357       if (!flag) break;
3358     }
3359     ns[node_count++] = blk_size;
3360     idx += blk_size*nzx;
3361     i    = j;
3362   }
3363   /* If not enough inodes found,, do not use inode version of the routines */
3364   if (!a->inode.size && m && node_count > .9*m) {
3365     ierr = PetscFree(ns);CHKERRQ(ierr);
3366     a->inode.node_count     = 0;
3367     a->inode.size           = PETSC_NULL;
3368     a->inode.use            = PETSC_FALSE;
3369     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3370   } else {
3371     A->ops->mult              = MatMult_SeqAIJ_Inode;
3372     A->ops->sor               = MatSOR_SeqAIJ_Inode;
3373     A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3374     A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3375     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3376     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3377     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3378     A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3379     A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3380     A->ops->lufactornumeric   = MatLUFactorNumeric_SeqAIJ_Inode_inplace;
3381     a->inode.node_count       = node_count;
3382     a->inode.size             = ns;
3383     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3384   }
3385   PetscFunctionReturn(0);
3386 }
3387 
3388 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
3389 PetscInt __k, *__vi; \
3390 __vi = aj + ai[row];				\
3391 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
3392 __vi = aj + adiag[row];				\
3393 cols[nzl] = __vi[0];\
3394 __vi = aj + adiag[row+1]+1;\
3395 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
3396 
3397 
3398 /*
3399    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3400    Modified from Mat_CheckInode().
3401 
3402    Input Parameters:
3403 +  Mat A - ILU or LU matrix factor
3404 -  samestructure - TURE indicates that the matrix has not changed its nonzero structure so we
3405     do not need to recompute the inodes
3406 */
3407 #undef __FUNCT__
3408 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3409 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3410 {
3411   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3412   PetscErrorCode ierr;
3413   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3414   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3415   PetscTruth     flag;
3416 
3417   PetscFunctionBegin;
3418   if (!a->inode.use)                     PetscFunctionReturn(0);
3419   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3420 
3421   m = A->rmap->n;
3422   if (a->inode.size) {ns = a->inode.size;}
3423   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3424 
3425   i          = 0;
3426   node_count = 0;
3427   while (i < m){                /* For each row */
3428     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3429     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3430     nzx  = nzl1 + nzu1 + 1;
3431     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3432     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3433 
3434     /* Limits the number of elements in a node to 'a->inode.limit' */
3435     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3436       nzl2    = ai[j+1] - ai[j];
3437       nzu2    = adiag[j] - adiag[j+1] - 1;
3438       nzy     = nzl2 + nzu2 + 1;
3439       if( nzy != nzx) break;
3440       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3441       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
3442       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3443       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3444       ierr = PetscFree(cols2);CHKERRQ(ierr);
3445     }
3446     ns[node_count++] = blk_size;
3447     ierr = PetscFree(cols1);CHKERRQ(ierr);
3448     i    = j;
3449   }
3450   /* If not enough inodes found,, do not use inode version of the routines */
3451   if (!a->inode.size && m && node_count > .9*m) {
3452     ierr = PetscFree(ns);CHKERRQ(ierr);
3453     a->inode.node_count     = 0;
3454     a->inode.size           = PETSC_NULL;
3455     a->inode.use            = PETSC_FALSE;
3456     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3457   } else {
3458     A->ops->mult              = 0;
3459     A->ops->sor               = 0;
3460     A->ops->multadd           = 0;
3461     A->ops->getrowij          = 0;
3462     A->ops->restorerowij      = 0;
3463     A->ops->getcolumnij       = 0;
3464     A->ops->restorecolumnij   = 0;
3465     A->ops->coloringpatch     = 0;
3466     A->ops->multdiagonalblock = 0;
3467     /* A->ops->lufactornumeric   = MatLUFactorNumeric_SeqAIJ_Inode; not done yet */
3468     a->inode.node_count       = node_count;
3469     a->inode.size             = ns;
3470     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3471   }
3472   PetscFunctionReturn(0);
3473 }
3474 
3475 /*
3476      This is really ugly. if inodes are used this replaces the
3477   permutations with ones that correspond to rows/cols of the matrix
3478   rather then inode blocks
3479 */
3480 #undef __FUNCT__
3481 #define __FUNCT__ "MatInodeAdjustForInodes"
3482 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3483 {
3484   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3485 
3486   PetscFunctionBegin;
3487   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3488   if (f) {
3489     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3490   }
3491   PetscFunctionReturn(0);
3492 }
3493 
3494 EXTERN_C_BEGIN
3495 #undef __FUNCT__
3496 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3497 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3498 {
3499   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3500   PetscErrorCode ierr;
3501   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3502   const PetscInt *ridx,*cidx;
3503   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3504   PetscInt       nslim_col,*ns_col;
3505   IS             ris = *rperm,cis = *cperm;
3506 
3507   PetscFunctionBegin;
3508   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3509   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3510 
3511   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3512   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3513   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3514 
3515   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3516   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3517 
3518   /* Form the inode structure for the rows of permuted matric using inv perm*/
3519   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3520 
3521   /* Construct the permutations for rows*/
3522   for (i=0,row = 0; i<nslim_row; ++i){
3523     indx      = ridx[i];
3524     start_val = tns[indx];
3525     end_val   = tns[indx + 1];
3526     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3527   }
3528 
3529   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3530   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3531 
3532  /* Construct permutations for columns */
3533   for (i=0,col=0; i<nslim_col; ++i){
3534     indx      = cidx[i];
3535     start_val = tns[indx];
3536     end_val   = tns[indx + 1];
3537     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3538   }
3539 
3540   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3541   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3542   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3543   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3544 
3545   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3546   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3547 
3548   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3549   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3550   ierr = ISDestroy(cis);CHKERRQ(ierr);
3551   ierr = ISDestroy(ris);CHKERRQ(ierr);
3552   ierr = PetscFree(tns);CHKERRQ(ierr);
3553   PetscFunctionReturn(0);
3554 }
3555 EXTERN_C_END
3556 
3557 #undef __FUNCT__
3558 #define __FUNCT__ "MatInodeGetInodeSizes"
3559 /*@C
3560    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3561 
3562    Collective on Mat
3563 
3564    Input Parameter:
3565 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3566 
3567    Output Parameter:
3568 +  node_count - no of inodes present in the matrix.
3569 .  sizes      - an array of size node_count,with sizes of each inode.
3570 -  limit      - the max size used to generate the inodes.
3571 
3572    Level: advanced
3573 
3574    Notes: This routine returns some internal storage information
3575    of the matrix, it is intended to be used by advanced users.
3576    It should be called after the matrix is assembled.
3577    The contents of the sizes[] array should not be changed.
3578    PETSC_NULL may be passed for information not requested.
3579 
3580 .keywords: matrix, seqaij, get, inode
3581 
3582 .seealso: MatGetInfo()
3583 @*/
3584 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3585 {
3586   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3587 
3588   PetscFunctionBegin;
3589   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3590   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3591   if (f) {
3592     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3593   }
3594   PetscFunctionReturn(0);
3595 }
3596 
3597 EXTERN_C_BEGIN
3598 #undef __FUNCT__
3599 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3600 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3601 {
3602   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3603 
3604   PetscFunctionBegin;
3605   if (node_count) *node_count = a->inode.node_count;
3606   if (sizes)      *sizes      = a->inode.size;
3607   if (limit)      *limit      = a->inode.limit;
3608   PetscFunctionReturn(0);
3609 }
3610 EXTERN_C_END
3611