xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 4f81c4b735514bc5c66feac99f3a97049537fe9b)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat              C=B;
1186   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187   IS               isrow = b->row,isicol = b->icol;
1188   PetscErrorCode   ierr;
1189   const PetscInt   *r,*ic,*ics;
1190   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191   PetscInt         i,j,k,nz,nzL,row,*pj;
1192   const PetscInt   *ajtmp,*bjtmp;
1193   MatScalar        *pc,*pc1,*pc2,*pc3,mul1,mul2,mul3,*pv,*rtmp1,*rtmp2,*rtmp3;
1194   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3;
1195   FactorShiftCtx   sctx;
1196   const PetscInt   *ddiag;
1197   PetscReal        rs;
1198   MatScalar        d;
1199   PetscInt         inod,nodesz,node_max,col;
1200   const PetscInt   *ns;
1201   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1202 
1203   PetscFunctionBegin;
1204   /* MatPivotSetUp(): initialize shift context sctx */
1205   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1206 
1207   if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1208     ddiag          = a->diag;
1209     sctx.shift_top = info->zeropivot;
1210     for (i=0; i<n; i++) {
1211       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1212       d  = (aa)[ddiag[i]];
1213       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1214       v  = aa+ai[i];
1215       nz = ai[i+1] - ai[i];
1216       for (j=0; j<nz; j++)
1217 	rs += PetscAbsScalar(v[j]);
1218       if (rs>sctx.shift_top) sctx.shift_top = rs;
1219     }
1220     sctx.shift_top   *= 1.1;
1221     sctx.nshift_max   = 5;
1222     sctx.shift_lo     = 0.;
1223     sctx.shift_hi     = 1.;
1224   }
1225 
1226   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1227   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1228 
1229   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1230   ierr  = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1231   rtmp2 = rtmp1 + n;
1232   rtmp3 = rtmp2 + n;
1233   ics   = ic;
1234 
1235   node_max = a->inode.node_count;
1236   ns       = a->inode.size;
1237   if (!ns){
1238     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1239   }
1240 
1241   /* If max inode size > 3, split it into two inodes.*/
1242   /* also map the inode sizes according to the ordering */
1243   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1244   for (i=0,j=0; i<node_max; ++i,++j){
1245     if (ns[i]>3) {
1246       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1247       ++j;
1248       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1249     } else {
1250       tmp_vec1[j] = ns[i];
1251     }
1252   }
1253   /* Use the correct node_max */
1254   node_max = j;
1255 
1256   /* Now reorder the inode info based on mat re-ordering info */
1257   /* First create a row -> inode_size_array_index map */
1258   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1259   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1260   for (i=0,row=0; i<node_max; i++) {
1261     nodesz = tmp_vec1[i];
1262     for (j=0; j<nodesz; j++,row++) {
1263       nsmap[row] = i;
1264     }
1265   }
1266   /* Using nsmap, create a reordered ns structure */
1267   for (i=0,j=0; i< node_max; i++) {
1268     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1269     tmp_vec2[i]  = nodesz;
1270     j           += nodesz;
1271   }
1272   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1273   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1274 
1275   /* Now use the correct ns */
1276   ns = tmp_vec2;
1277 
1278   do {
1279     sctx.useshift = PETSC_FALSE;
1280     /* Now loop over each block-row, and do the factorization */
1281     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1282       nodesz = ns[inod];
1283 
1284       switch (nodesz){
1285       case 1:
1286       /*----------*/
1287         /* zero rtmp1 */
1288         /* L part */
1289         nz    = bi[i+1] - bi[i];
1290         bjtmp = bj + bi[i];
1291         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1292 
1293         /* U part */
1294         nz = bdiag[i]-bdiag[i+1];
1295         bjtmp = bj + bdiag[i+1]+1;
1296         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1297 
1298         /* load in initial (unfactored row) */
1299         nz    = ai[r[i]+1] - ai[r[i]];
1300         ajtmp = aj + ai[r[i]];
1301         v     = aa + ai[r[i]];
1302         for (j=0; j<nz; j++) {
1303           rtmp1[ics[ajtmp[j]]] = v[j];
1304         }
1305         /* ZeropivotApply() */
1306         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1307 
1308         /* elimination */
1309         bjtmp = bj + bi[i];
1310         row   = *bjtmp++;
1311         nzL   = bi[i+1] - bi[i];
1312         for(k=0; k < nzL;k++) {
1313           pc = rtmp1 + row;
1314           if (*pc != 0.0) {
1315             pv   = b->a + bdiag[row];
1316             mul1 = *pc * (*pv);
1317             *pc  = mul1;
1318             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1319             pv = b->a + bdiag[row+1]+1;
1320             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1321             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1322             ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1323           }
1324           row = *bjtmp++;
1325         }
1326 
1327         /* finished row so stick it into b->a */
1328         rs = 0.0;
1329         /* L part */
1330         pv = b->a + bi[i] ;
1331         pj = b->j + bi[i] ;
1332         nz = bi[i+1] - bi[i];
1333         for (j=0; j<nz; j++) {
1334           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1335         }
1336 
1337         /* U part */
1338         pv = b->a + bdiag[i+1]+1;
1339         pj = b->j + bdiag[i+1]+1;
1340         nz = bdiag[i] - bdiag[i+1]-1;
1341         for (j=0; j<nz; j++) {
1342           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1343         }
1344 
1345         /* Check zero pivot */
1346         sctx.rs = rs;
1347         sctx.pv = rtmp1[i];
1348         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1349 
1350         /* Mark diagonal and invert diagonal for simplier triangular solves */
1351         pv  = b->a + bdiag[i];
1352         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1353         break;
1354 
1355       case 2:
1356       /*----------*/
1357         /* zero rtmp1 and rtmp2 */
1358         /* L part */
1359         nz    = bi[i+1] - bi[i];
1360         bjtmp = bj + bi[i];
1361         for  (j=0; j<nz; j++) {
1362           col = bjtmp[j];
1363           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1364         }
1365 
1366         /* U part */
1367         nz = bdiag[i]-bdiag[i+1];
1368         bjtmp = bj + bdiag[i+1]+1;
1369         for  (j=0; j<nz; j++) {
1370           col = bjtmp[j];
1371           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1372         }
1373 
1374         /* load in initial (unfactored row) */
1375         nz    = ai[r[i]+1] - ai[r[i]];
1376         ajtmp = aj + ai[r[i]];
1377         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1378         for (j=0; j<nz; j++) {
1379           col = ics[ajtmp[j]];
1380           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1381         }
1382         /* ZeropivotApply(): shift the diagonal of the matrix  */
1383         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1384 
1385         /* elimination */
1386         bjtmp = bj + bi[i];
1387         row   = *bjtmp++; /* pivot row */
1388         nzL   = bi[i+1] - bi[i];
1389         for(k=0; k < nzL;k++) {
1390           pc1 = rtmp1 + row;
1391           pc2 = rtmp2 + row;
1392           if (*pc1 != 0.0 || *pc2 != 0.0) {
1393             pv   = b->a + bdiag[row];
1394             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1395             *pc1 = mul1;       *pc2 = mul2;
1396 
1397             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1398             pv = b->a + bdiag[row+1]+1;
1399             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1400             for (j=0; j<nz; j++){
1401               col = pj[j];
1402               rtmp1[col] -= mul1 * pv[j];
1403               rtmp2[col] -= mul2 * pv[j];
1404             }
1405             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1406           }
1407           row = *bjtmp++;
1408         }
1409 
1410         /* finished row i; check zero pivot, then stick row i into b->a */
1411         rs  = 0.0;
1412         /* L part */
1413         pc1 = b->a + bi[i];
1414         pj  = b->j + bi[i] ;
1415         nz  = bi[i+1] - bi[i];
1416         for (j=0; j<nz; j++) {
1417           col = pj[j];
1418           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1419         }
1420         /* U part */
1421         pc1 = b->a + bdiag[i+1]+1;
1422         pj  = b->j + bdiag[i+1]+1;
1423         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1424         for (j=0; j<nz; j++) {
1425           col = pj[j];
1426           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1427         }
1428 
1429         sctx.rs  = rs;
1430         sctx.pv  = rtmp1[i];
1431         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1432         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1433         *pc1 = 1.0/sctx.pv;
1434 
1435         /* Now take care of diagonal 2x2 block. */
1436         pc2 = rtmp2 + i;
1437         if (*pc2 != 0.0){
1438           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1439           *pc2 = mul1;          /* insert L entry */
1440           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1441           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1442           for (j=0; j<nz; j++) {
1443             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1444           }
1445           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1446         }
1447 
1448         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1449         rs = 0.0;
1450         /* L part */
1451         pc2 = b->a + bi[i+1];
1452         pj  = b->j + bi[i+1] ;
1453         nz  = bi[i+2] - bi[i+1];
1454         for (j=0; j<nz; j++) {
1455           col = pj[j];
1456           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457         }
1458         /* U part */
1459         pc2 = b->a + bdiag[i+2]+1;
1460         pj  = b->j + bdiag[i+2]+1;
1461         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1462         for (j=0; j<nz; j++) {
1463           col = pj[j];
1464           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1465         }
1466 
1467         sctx.rs  = rs;
1468         sctx.pv  = rtmp2[i+1];
1469         ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr);
1470         pc2  = b->a + bdiag[i+1];
1471         *pc2 = 1.0/sctx.pv;
1472         break;
1473 
1474       case 3:
1475       /*----------*/
1476         /* zero rtmp */
1477         /* L part */
1478         nz    = bi[i+1] - bi[i];
1479         bjtmp = bj + bi[i];
1480         for  (j=0; j<nz; j++) {
1481           col = bjtmp[j];
1482           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1483         }
1484 
1485         /* U part */
1486         nz = bdiag[i]-bdiag[i+1];
1487         bjtmp = bj + bdiag[i+1]+1;
1488         for  (j=0; j<nz; j++) {
1489           col = bjtmp[j];
1490           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1491         }
1492 
1493         /* load in initial (unfactored row) */
1494         nz    = ai[r[i]+1] - ai[r[i]];
1495         ajtmp = aj + ai[r[i]];
1496         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1497         for (j=0; j<nz; j++) {
1498           col = ics[ajtmp[j]];
1499           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1500         }
1501         /* ZeropivotApply(): shift the diagonal of the matrix  */
1502         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1503 
1504         /* elimination */
1505         bjtmp = bj + bi[i];
1506         row   = *bjtmp++; /* pivot row */
1507         nzL   = bi[i+1] - bi[i];
1508         for(k=0; k < nzL;k++) {
1509           pc1 = rtmp1 + row;
1510           pc2 = rtmp2 + row;
1511           pc3 = rtmp3 + row;
1512           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1513             pv  = b->a + bdiag[row];
1514             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1515             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1516 
1517             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1518             pv = b->a + bdiag[row+1]+1;
1519             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1520             for (j=0; j<nz; j++){
1521               col = pj[j];
1522               rtmp1[col] -= mul1 * pv[j];
1523               rtmp2[col] -= mul2 * pv[j];
1524               rtmp3[col] -= mul3 * pv[j];
1525             }
1526             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1527           }
1528           row = *bjtmp++;
1529         }
1530 
1531         /* finished row i; check zero pivot, then stick row i into b->a */
1532         rs  = 0.0;
1533         /* L part */
1534         pc1 = b->a + bi[i];
1535         pj  = b->j + bi[i] ;
1536         nz  = bi[i+1] - bi[i];
1537         for (j=0; j<nz; j++) {
1538           col = pj[j];
1539           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1540         }
1541         /* U part */
1542         pc1 = b->a + bdiag[i+1]+1;
1543         pj  = b->j + bdiag[i+1]+1;
1544         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1545         for (j=0; j<nz; j++) {
1546           col = pj[j];
1547           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1548         }
1549 
1550         sctx.rs  = rs;
1551         sctx.pv  = rtmp1[i];
1552         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1553         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1554         *pc1 = 1.0/sctx.pv;
1555 
1556         /* Now take care of 1st column of diagonal 3x3 block. */
1557         pc2 = rtmp2 + i;
1558         pc3 = rtmp3 + i;
1559         if (*pc2 != 0.0 || *pc3 != 0.0){
1560           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1561           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1562           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1563           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1564           for (j=0; j<nz; j++) {
1565             col = pj[j];
1566             rtmp2[col] -= mul2 * rtmp1[col];
1567             rtmp3[col] -= mul3 * rtmp1[col];
1568           }
1569           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1570         }
1571 
1572         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1573         rs = 0.0;
1574         /* L part */
1575         pc2 = b->a + bi[i+1];
1576         pj  = b->j + bi[i+1] ;
1577         nz  = bi[i+2] - bi[i+1];
1578         for (j=0; j<nz; j++) {
1579           col = pj[j];
1580           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1581         }
1582         /* U part */
1583         pc2 = b->a + bdiag[i+2]+1;
1584         pj  = b->j + bdiag[i+2]+1;
1585         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1586         for (j=0; j<nz; j++) {
1587           col = pj[j];
1588           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1589         }
1590 
1591         sctx.rs  = rs;
1592         sctx.pv  = rtmp2[i+1];
1593         ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr);
1594         pc2  = b->a + bdiag[i+1];
1595         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1596 
1597         /* Now take care of 2nd column of diagonal 3x3 block. */
1598         pc3 = rtmp3 + i+1;
1599         if (*pc3 != 0.0){
1600           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1601           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1602           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1603           for (j=0; j<nz; j++) {
1604             col = pj[j];
1605             rtmp3[col] -= mul3 * rtmp2[col];
1606           }
1607           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1608         }
1609 
1610         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1611         rs = 0.0;
1612         /* L part */
1613         pc3 = b->a + bi[i+2];
1614         pj  = b->j + bi[i+2] ;
1615         nz  = bi[i+3] - bi[i+2];
1616         for (j=0; j<nz; j++) {
1617           col = pj[j];
1618           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1619         }
1620         /* U part */
1621         pc3 = b->a + bdiag[i+3]+1;
1622         pj  = b->j + bdiag[i+3]+1;
1623         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1624         for (j=0; j<nz; j++) {
1625           col = pj[j];
1626           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1627         }
1628 
1629         sctx.rs  = rs;
1630         sctx.pv  = rtmp3[i+2];
1631         ierr = MatPivotCheck(info,sctx,i+2);CHKERRQ(ierr);
1632         pc3  = b->a + bdiag[i+2];
1633         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1634         break;
1635 
1636       default:
1637         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
1638       }
1639       i += nodesz;                 /* Update the row */
1640     }
1641 
1642     /* MatPivotRefine() */
1643     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1644       /*
1645        * if no shift in this attempt & shifting & started shifting & can refine,
1646        * then try lower shift
1647        */
1648       sctx.shift_hi       = sctx.shift_fraction;
1649       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1650       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1651       sctx.useshift       = PETSC_TRUE;
1652       sctx.nshift++;
1653     }
1654   } while (sctx.useshift);
1655 
1656   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1657   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1658   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1659   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1660 
1661   C->ops->solve              = MatSolve_SeqAIJ_Inode;
1662   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1663   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1664   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1665   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1666   C->assembled    = PETSC_TRUE;
1667   C->preallocated = PETSC_TRUE;
1668   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1669 
1670   /* MatShiftView(A,info,&sctx) */
1671   if (sctx.nshift){
1672     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) {
1673       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1674     } else if (info->shifttype == MAT_SHIFT_NONZERO) {
1675       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1676     } else if (info->shifttype == MAT_SHIFT_INBLOCKS){
1677       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1678     }
1679   }
1680   PetscFunctionReturn(0);
1681 }
1682 
1683 #undef __FUNCT__
1684 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1685 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1686 {
1687   Mat               C = B;
1688   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1689   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1690   PetscErrorCode    ierr;
1691   const PetscInt    *r,*ic,*c,*ics;
1692   PetscInt          n = A->rmap->n,*bi = b->i;
1693   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1694   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1695   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1696   PetscScalar       mul1,mul2,mul3,tmp;
1697   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1698   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1699   PetscReal         rs=0.0;
1700   FactorShiftCtx    sctx;
1701   PetscInt          newshift;
1702 
1703   PetscFunctionBegin;
1704   sctx.shift_top      = 0;
1705   sctx.nshift_max     = 0;
1706   sctx.shift_lo       = 0;
1707   sctx.shift_hi       = 0;
1708   sctx.shift_fraction = 0;
1709 
1710   /* if both shift schemes are chosen by user, only use info->shiftpd */
1711   if (info->shifttype==MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1712     sctx.shift_top = 0;
1713     for (i=0; i<n; i++) {
1714       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1715       rs    = 0.0;
1716       ajtmp = aj + ai[i];
1717       rtmp1 = aa + ai[i];
1718       nz = ai[i+1] - ai[i];
1719       for (j=0; j<nz; j++){
1720         if (*ajtmp != i){
1721           rs += PetscAbsScalar(*rtmp1++);
1722         } else {
1723           rs -= PetscRealPart(*rtmp1++);
1724         }
1725         ajtmp++;
1726       }
1727       if (rs>sctx.shift_top) sctx.shift_top = rs;
1728     }
1729     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1730     sctx.shift_top *= 1.1;
1731     sctx.nshift_max = 5;
1732     sctx.shift_lo   = 0.;
1733     sctx.shift_hi   = 1.;
1734   }
1735   sctx.shift_amount = 0;
1736   sctx.nshift       = 0;
1737 
1738   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1739   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1740   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1741   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1742   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1743   ics   = ic ;
1744   rtmp22 = rtmp11 + n;
1745   rtmp33 = rtmp22 + n;
1746 
1747   node_max = a->inode.node_count;
1748   ns       = a->inode.size;
1749   if (!ns){
1750     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1751   }
1752 
1753   /* If max inode size > 3, split it into two inodes.*/
1754   /* also map the inode sizes according to the ordering */
1755   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1756   for (i=0,j=0; i<node_max; ++i,++j){
1757     if (ns[i]>3) {
1758       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1759       ++j;
1760       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1761     } else {
1762       tmp_vec1[j] = ns[i];
1763     }
1764   }
1765   /* Use the correct node_max */
1766   node_max = j;
1767 
1768   /* Now reorder the inode info based on mat re-ordering info */
1769   /* First create a row -> inode_size_array_index map */
1770   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1771   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1772   for (i=0,row=0; i<node_max; i++) {
1773     nodesz = tmp_vec1[i];
1774     for (j=0; j<nodesz; j++,row++) {
1775       nsmap[row] = i;
1776     }
1777   }
1778   /* Using nsmap, create a reordered ns structure */
1779   for (i=0,j=0; i< node_max; i++) {
1780     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1781     tmp_vec2[i]  = nodesz;
1782     j           += nodesz;
1783   }
1784   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1785   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1786   /* Now use the correct ns */
1787   ns = tmp_vec2;
1788 
1789   do {
1790     sctx.useshift = PETSC_FALSE;
1791     /* Now loop over each block-row, and do the factorization */
1792     for (i=0,row=0; i<node_max; i++) {
1793       nodesz = ns[i];
1794       nz     = bi[row+1] - bi[row];
1795       bjtmp  = bj + bi[row];
1796 
1797       switch (nodesz){
1798       case 1:
1799         for  (j=0; j<nz; j++){
1800           idx        = bjtmp[j];
1801           rtmp11[idx] = 0.0;
1802         }
1803 
1804         /* load in initial (unfactored row) */
1805         idx    = r[row];
1806         nz_tmp = ai[idx+1] - ai[idx];
1807         ajtmp  = aj + ai[idx];
1808         v1     = aa + ai[idx];
1809 
1810         for (j=0; j<nz_tmp; j++) {
1811           idx        = ics[ajtmp[j]];
1812           rtmp11[idx] = v1[j];
1813         }
1814         rtmp11[ics[r[row]]] += sctx.shift_amount;
1815 
1816         prow = *bjtmp++ ;
1817         while (prow < row) {
1818           pc1 = rtmp11 + prow;
1819           if (*pc1 != 0.0){
1820             pv   = ba + bd[prow];
1821             pj   = nbj + bd[prow];
1822             mul1 = *pc1 * *pv++;
1823             *pc1 = mul1;
1824             nz_tmp = bi[prow+1] - bd[prow] - 1;
1825             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1826             for (j=0; j<nz_tmp; j++) {
1827               tmp = pv[j];
1828               idx = pj[j];
1829               rtmp11[idx] -= mul1 * tmp;
1830             }
1831           }
1832           prow = *bjtmp++ ;
1833         }
1834         pj  = bj + bi[row];
1835         pc1 = ba + bi[row];
1836 
1837         sctx.pv    = rtmp11[row];
1838         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1839         rs         = 0.0;
1840         for (j=0; j<nz; j++) {
1841           idx    = pj[j];
1842           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1843           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1844         }
1845         sctx.rs  = rs;
1846         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1847         if (newshift == 1) goto endofwhile;
1848         break;
1849 
1850       case 2:
1851         for (j=0; j<nz; j++) {
1852           idx        = bjtmp[j];
1853           rtmp11[idx] = 0.0;
1854           rtmp22[idx] = 0.0;
1855         }
1856 
1857         /* load in initial (unfactored row) */
1858         idx    = r[row];
1859         nz_tmp = ai[idx+1] - ai[idx];
1860         ajtmp  = aj + ai[idx];
1861         v1     = aa + ai[idx];
1862         v2     = aa + ai[idx+1];
1863         for (j=0; j<nz_tmp; j++) {
1864           idx        = ics[ajtmp[j]];
1865           rtmp11[idx] = v1[j];
1866           rtmp22[idx] = v2[j];
1867         }
1868         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1869         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1870 
1871         prow = *bjtmp++ ;
1872         while (prow < row) {
1873           pc1 = rtmp11 + prow;
1874           pc2 = rtmp22 + prow;
1875           if (*pc1 != 0.0 || *pc2 != 0.0){
1876             pv   = ba + bd[prow];
1877             pj   = nbj + bd[prow];
1878             mul1 = *pc1 * *pv;
1879             mul2 = *pc2 * *pv;
1880             ++pv;
1881             *pc1 = mul1;
1882             *pc2 = mul2;
1883 
1884             nz_tmp = bi[prow+1] - bd[prow] - 1;
1885             for (j=0; j<nz_tmp; j++) {
1886               tmp = pv[j];
1887               idx = pj[j];
1888               rtmp11[idx] -= mul1 * tmp;
1889               rtmp22[idx] -= mul2 * tmp;
1890             }
1891             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1892           }
1893           prow = *bjtmp++ ;
1894         }
1895 
1896         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1897         pc1 = rtmp11 + prow;
1898         pc2 = rtmp22 + prow;
1899 
1900         sctx.pv = *pc1;
1901         pj      = bj + bi[prow];
1902         rs      = 0.0;
1903         for (j=0; j<nz; j++){
1904           idx = pj[j];
1905           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1906         }
1907         sctx.rs = rs;
1908         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1909         if (newshift == 1) goto endofwhile;
1910 
1911         if (*pc2 != 0.0){
1912           pj     = nbj + bd[prow];
1913           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1914           *pc2   = mul2;
1915           nz_tmp = bi[prow+1] - bd[prow] - 1;
1916           for (j=0; j<nz_tmp; j++) {
1917             idx = pj[j] ;
1918             tmp = rtmp11[idx];
1919             rtmp22[idx] -= mul2 * tmp;
1920           }
1921           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1922         }
1923 
1924         pj  = bj + bi[row];
1925         pc1 = ba + bi[row];
1926         pc2 = ba + bi[row+1];
1927 
1928         sctx.pv = rtmp22[row+1];
1929         rs = 0.0;
1930         rtmp11[row]   = 1.0/rtmp11[row];
1931         rtmp22[row+1] = 1.0/rtmp22[row+1];
1932         /* copy row entries from dense representation to sparse */
1933         for (j=0; j<nz; j++) {
1934           idx    = pj[j];
1935           pc1[j] = rtmp11[idx];
1936           pc2[j] = rtmp22[idx];
1937           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1938         }
1939         sctx.rs = rs;
1940         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1941         if (newshift == 1) goto endofwhile;
1942         break;
1943 
1944       case 3:
1945         for  (j=0; j<nz; j++) {
1946           idx        = bjtmp[j];
1947           rtmp11[idx] = 0.0;
1948           rtmp22[idx] = 0.0;
1949           rtmp33[idx] = 0.0;
1950         }
1951         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1952         idx    = r[row];
1953         nz_tmp = ai[idx+1] - ai[idx];
1954         ajtmp = aj + ai[idx];
1955         v1    = aa + ai[idx];
1956         v2    = aa + ai[idx+1];
1957         v3    = aa + ai[idx+2];
1958         for (j=0; j<nz_tmp; j++) {
1959           idx        = ics[ajtmp[j]];
1960           rtmp11[idx] = v1[j];
1961           rtmp22[idx] = v2[j];
1962           rtmp33[idx] = v3[j];
1963         }
1964         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1965         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1966         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1967 
1968         /* loop over all pivot row blocks above this row block */
1969         prow = *bjtmp++ ;
1970         while (prow < row) {
1971           pc1 = rtmp11 + prow;
1972           pc2 = rtmp22 + prow;
1973           pc3 = rtmp33 + prow;
1974           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1975             pv   = ba  + bd[prow];
1976             pj   = nbj + bd[prow];
1977             mul1 = *pc1 * *pv;
1978             mul2 = *pc2 * *pv;
1979             mul3 = *pc3 * *pv;
1980             ++pv;
1981             *pc1 = mul1;
1982             *pc2 = mul2;
1983             *pc3 = mul3;
1984 
1985             nz_tmp = bi[prow+1] - bd[prow] - 1;
1986             /* update this row based on pivot row */
1987             for (j=0; j<nz_tmp; j++) {
1988               tmp = pv[j];
1989               idx = pj[j];
1990               rtmp11[idx] -= mul1 * tmp;
1991               rtmp22[idx] -= mul2 * tmp;
1992               rtmp33[idx] -= mul3 * tmp;
1993             }
1994             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1995           }
1996           prow = *bjtmp++ ;
1997         }
1998 
1999         /* Now take care of diagonal 3x3 block in this set of rows */
2000         /* note: prow = row here */
2001         pc1 = rtmp11 + prow;
2002         pc2 = rtmp22 + prow;
2003         pc3 = rtmp33 + prow;
2004 
2005         sctx.pv = *pc1;
2006         pj      = bj + bi[prow];
2007         rs      = 0.0;
2008         for (j=0; j<nz; j++){
2009           idx = pj[j];
2010           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2011         }
2012         sctx.rs = rs;
2013         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2014         if (newshift == 1) goto endofwhile;
2015 
2016         if (*pc2 != 0.0 || *pc3 != 0.0){
2017           mul2 = (*pc2)/(*pc1);
2018           mul3 = (*pc3)/(*pc1);
2019           *pc2 = mul2;
2020           *pc3 = mul3;
2021           nz_tmp = bi[prow+1] - bd[prow] - 1;
2022           pj     = nbj + bd[prow];
2023           for (j=0; j<nz_tmp; j++) {
2024             idx = pj[j] ;
2025             tmp = rtmp11[idx];
2026             rtmp22[idx] -= mul2 * tmp;
2027             rtmp33[idx] -= mul3 * tmp;
2028           }
2029           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2030         }
2031         ++prow;
2032 
2033         pc2 = rtmp22 + prow;
2034         pc3 = rtmp33 + prow;
2035         sctx.pv = *pc2;
2036         pj      = bj + bi[prow];
2037         rs      = 0.0;
2038         for (j=0; j<nz; j++){
2039           idx = pj[j];
2040           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2041         }
2042         sctx.rs = rs;
2043         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
2044         if (newshift == 1) goto endofwhile;
2045 
2046         if (*pc3 != 0.0){
2047           mul3   = (*pc3)/(*pc2);
2048           *pc3   = mul3;
2049           pj     = nbj + bd[prow];
2050           nz_tmp = bi[prow+1] - bd[prow] - 1;
2051           for (j=0; j<nz_tmp; j++) {
2052             idx = pj[j] ;
2053             tmp = rtmp22[idx];
2054             rtmp33[idx] -= mul3 * tmp;
2055           }
2056           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2057         }
2058 
2059         pj  = bj + bi[row];
2060         pc1 = ba + bi[row];
2061         pc2 = ba + bi[row+1];
2062         pc3 = ba + bi[row+2];
2063 
2064         sctx.pv = rtmp33[row+2];
2065         rs = 0.0;
2066         rtmp11[row]   = 1.0/rtmp11[row];
2067         rtmp22[row+1] = 1.0/rtmp22[row+1];
2068         rtmp33[row+2] = 1.0/rtmp33[row+2];
2069         /* copy row entries from dense representation to sparse */
2070         for (j=0; j<nz; j++) {
2071           idx    = pj[j];
2072           pc1[j] = rtmp11[idx];
2073           pc2[j] = rtmp22[idx];
2074           pc3[j] = rtmp33[idx];
2075           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2076         }
2077 
2078         sctx.rs = rs;
2079         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
2080         if (newshift == 1) goto endofwhile;
2081         break;
2082 
2083       default:
2084         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
2085       }
2086       row += nodesz;                 /* Update the row */
2087     }
2088     endofwhile:;
2089   } while (sctx.useshift);
2090   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2091   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2092   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2093   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2094   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2095   (B)->ops->solve           = MatSolve_SeqAIJ_Inode_inplace;
2096   /* do not set solve add, since MatSolve_Inode + Add is faster */
2097   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2098   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2099   C->assembled   = PETSC_TRUE;
2100   C->preallocated = PETSC_TRUE;
2101   if (sctx.nshift) {
2102     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) {
2103       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2104     } else if (info->shifttype == MAT_SHIFT_NONZERO) {
2105       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2106     }
2107   }
2108   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2109   PetscFunctionReturn(0);
2110 }
2111 
2112 
2113 /* ----------------------------------------------------------- */
2114 #undef __FUNCT__
2115 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2116 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2117 {
2118   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2119   IS                iscol = a->col,isrow = a->row;
2120   PetscErrorCode    ierr;
2121   const PetscInt    *r,*c,*rout,*cout;
2122   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2123   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2124   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2125   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2126   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2127   const PetscScalar *b;
2128 
2129   PetscFunctionBegin;
2130   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
2131   node_max = a->inode.node_count;
2132   ns       = a->inode.size;     /* Node Size array */
2133 
2134   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2135   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2136   tmp  = a->solve_work;
2137 
2138   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2139   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2140 
2141   /* forward solve the lower triangular */
2142   tmps = tmp ;
2143   aa   = a_a ;
2144   aj   = a_j ;
2145   ad   = a->diag;
2146 
2147   for (i = 0,row = 0; i< node_max; ++i){
2148     nsz = ns[i];
2149     aii = ai[row];
2150     v1  = aa + aii;
2151     vi  = aj + aii;
2152     nz  = ai[row+1]- ai[row];
2153 
2154     if (i < node_max-1) {
2155       /* Prefetch the indices for the next block */
2156       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2157       /* Prefetch the data for the next block */
2158       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2159     }
2160 
2161     switch (nsz){               /* Each loop in 'case' is unrolled */
2162     case 1 :
2163       sum1 = b[r[row]];
2164       for(j=0; j<nz-1; j+=2){
2165         i0   = vi[j];
2166         i1   = vi[j+1];
2167         tmp0 = tmps[i0];
2168         tmp1 = tmps[i1];
2169         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2170       }
2171       if(j == nz-1){
2172         tmp0 = tmps[vi[j]];
2173         sum1 -= v1[j]*tmp0;
2174       }
2175       tmp[row++]=sum1;
2176       break;
2177     case 2:
2178       sum1 = b[r[row]];
2179       sum2 = b[r[row+1]];
2180       v2   = aa + ai[row+1];
2181 
2182       for(j=0; j<nz-1; j+=2){
2183         i0   = vi[j];
2184         i1   = vi[j+1];
2185         tmp0 = tmps[i0];
2186         tmp1 = tmps[i1];
2187         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2188         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2189       }
2190       if(j == nz-1){
2191         tmp0 = tmps[vi[j]];
2192         sum1 -= v1[j] *tmp0;
2193         sum2 -= v2[j] *tmp0;
2194       }
2195       sum2 -= v2[nz] * sum1;
2196       tmp[row ++]=sum1;
2197       tmp[row ++]=sum2;
2198       break;
2199     case 3:
2200       sum1 = b[r[row]];
2201       sum2 = b[r[row+1]];
2202       sum3 = b[r[row+2]];
2203       v2   = aa + ai[row+1];
2204       v3   = aa + ai[row+2];
2205 
2206       for (j=0; j<nz-1; j+=2){
2207         i0   = vi[j];
2208         i1   = vi[j+1];
2209         tmp0 = tmps[i0];
2210         tmp1 = tmps[i1];
2211         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2212         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2213         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2214       }
2215       if (j == nz-1){
2216         tmp0 = tmps[vi[j]];
2217         sum1 -= v1[j] *tmp0;
2218         sum2 -= v2[j] *tmp0;
2219         sum3 -= v3[j] *tmp0;
2220       }
2221       sum2 -= v2[nz] * sum1;
2222       sum3 -= v3[nz] * sum1;
2223       sum3 -= v3[nz+1] * sum2;
2224       tmp[row ++]=sum1;
2225       tmp[row ++]=sum2;
2226       tmp[row ++]=sum3;
2227       break;
2228 
2229     case 4:
2230       sum1 = b[r[row]];
2231       sum2 = b[r[row+1]];
2232       sum3 = b[r[row+2]];
2233       sum4 = b[r[row+3]];
2234       v2   = aa + ai[row+1];
2235       v3   = aa + ai[row+2];
2236       v4   = aa + ai[row+3];
2237 
2238       for (j=0; j<nz-1; j+=2){
2239         i0   = vi[j];
2240         i1   = vi[j+1];
2241         tmp0 = tmps[i0];
2242         tmp1 = tmps[i1];
2243         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2244         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2245         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2246         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2247       }
2248       if (j == nz-1){
2249         tmp0 = tmps[vi[j]];
2250         sum1 -= v1[j] *tmp0;
2251         sum2 -= v2[j] *tmp0;
2252         sum3 -= v3[j] *tmp0;
2253         sum4 -= v4[j] *tmp0;
2254       }
2255       sum2 -= v2[nz] * sum1;
2256       sum3 -= v3[nz] * sum1;
2257       sum4 -= v4[nz] * sum1;
2258       sum3 -= v3[nz+1] * sum2;
2259       sum4 -= v4[nz+1] * sum2;
2260       sum4 -= v4[nz+2] * sum3;
2261 
2262       tmp[row ++]=sum1;
2263       tmp[row ++]=sum2;
2264       tmp[row ++]=sum3;
2265       tmp[row ++]=sum4;
2266       break;
2267     case 5:
2268       sum1 = b[r[row]];
2269       sum2 = b[r[row+1]];
2270       sum3 = b[r[row+2]];
2271       sum4 = b[r[row+3]];
2272       sum5 = b[r[row+4]];
2273       v2   = aa + ai[row+1];
2274       v3   = aa + ai[row+2];
2275       v4   = aa + ai[row+3];
2276       v5   = aa + ai[row+4];
2277 
2278       for (j=0; j<nz-1; j+=2){
2279         i0   = vi[j];
2280         i1   = vi[j+1];
2281         tmp0 = tmps[i0];
2282         tmp1 = tmps[i1];
2283         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2284         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2285         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2286         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2287         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2288       }
2289       if (j == nz-1){
2290         tmp0 = tmps[vi[j]];
2291         sum1 -= v1[j] *tmp0;
2292         sum2 -= v2[j] *tmp0;
2293         sum3 -= v3[j] *tmp0;
2294         sum4 -= v4[j] *tmp0;
2295         sum5 -= v5[j] *tmp0;
2296       }
2297 
2298       sum2 -= v2[nz] * sum1;
2299       sum3 -= v3[nz] * sum1;
2300       sum4 -= v4[nz] * sum1;
2301       sum5 -= v5[nz] * sum1;
2302       sum3 -= v3[nz+1] * sum2;
2303       sum4 -= v4[nz+1] * sum2;
2304       sum5 -= v5[nz+1] * sum2;
2305       sum4 -= v4[nz+2] * sum3;
2306       sum5 -= v5[nz+2] * sum3;
2307       sum5 -= v5[nz+3] * sum4;
2308 
2309       tmp[row ++]=sum1;
2310       tmp[row ++]=sum2;
2311       tmp[row ++]=sum3;
2312       tmp[row ++]=sum4;
2313       tmp[row ++]=sum5;
2314       break;
2315     default:
2316       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2317     }
2318   }
2319   /* backward solve the upper triangular */
2320   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2321     nsz = ns[i];
2322     aii = ad[row+1] + 1;
2323     v1  = aa + aii;
2324     vi  = aj + aii;
2325     nz  = ad[row]- ad[row+1] - 1;
2326 
2327     if (i > 0) {
2328       /* Prefetch the indices for the next block */
2329       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2330       /* Prefetch the data for the next block */
2331       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2332     }
2333 
2334     switch (nsz){               /* Each loop in 'case' is unrolled */
2335     case 1 :
2336       sum1 = tmp[row];
2337 
2338       for(j=0 ; j<nz-1; j+=2){
2339         i0   = vi[j];
2340         i1   = vi[j+1];
2341         tmp0 = tmps[i0];
2342         tmp1 = tmps[i1];
2343         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2344       }
2345       if (j == nz-1){
2346         tmp0  = tmps[vi[j]];
2347         sum1 -= v1[j]*tmp0;
2348       }
2349       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2350       break;
2351     case 2 :
2352       sum1 = tmp[row];
2353       sum2 = tmp[row-1];
2354       v2   = aa + ad[row] + 1;
2355       for (j=0 ; j<nz-1; j+=2){
2356         i0   = vi[j];
2357         i1   = vi[j+1];
2358         tmp0 = tmps[i0];
2359         tmp1 = tmps[i1];
2360         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2361         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2362       }
2363       if (j == nz-1){
2364         tmp0  = tmps[vi[j]];
2365         sum1 -= v1[j]* tmp0;
2366         sum2 -= v2[j+1]* tmp0;
2367       }
2368 
2369       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2370       sum2   -= v2[0] * tmp0;
2371       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2372       break;
2373     case 3 :
2374       sum1 = tmp[row];
2375       sum2 = tmp[row -1];
2376       sum3 = tmp[row -2];
2377       v2   = aa + ad[row] + 1;
2378       v3   = aa + ad[row -1] + 1;
2379       for (j=0 ; j<nz-1; j+=2){
2380         i0   = vi[j];
2381         i1   = vi[j+1];
2382         tmp0 = tmps[i0];
2383         tmp1 = tmps[i1];
2384         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2385         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2386         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2387       }
2388       if (j== nz-1){
2389         tmp0  = tmps[vi[j]];
2390         sum1 -= v1[j] * tmp0;
2391         sum2 -= v2[j+1] * tmp0;
2392         sum3 -= v3[j+2] * tmp0;
2393       }
2394       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2395       sum2   -= v2[0]* tmp0;
2396       sum3   -= v3[1] * tmp0;
2397       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2398       sum3   -= v3[0]* tmp0;
2399       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2400 
2401       break;
2402     case 4 :
2403       sum1 = tmp[row];
2404       sum2 = tmp[row -1];
2405       sum3 = tmp[row -2];
2406       sum4 = tmp[row -3];
2407       v2   = aa + ad[row]+1;
2408       v3   = aa + ad[row -1]+1;
2409       v4   = aa + ad[row -2]+1;
2410 
2411       for (j=0 ; j<nz-1; j+=2){
2412         i0   = vi[j];
2413         i1   = vi[j+1];
2414         tmp0 = tmps[i0];
2415         tmp1 = tmps[i1];
2416         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2417         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2418         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2419         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2420       }
2421       if (j== nz-1){
2422         tmp0  = tmps[vi[j]];
2423         sum1 -= v1[j] * tmp0;
2424         sum2 -= v2[j+1] * tmp0;
2425         sum3 -= v3[j+2] * tmp0;
2426         sum4 -= v4[j+3] * tmp0;
2427       }
2428 
2429       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2430       sum2   -= v2[0] * tmp0;
2431       sum3   -= v3[1] * tmp0;
2432       sum4   -= v4[2] * tmp0;
2433       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2434       sum3   -= v3[0] * tmp0;
2435       sum4   -= v4[1] * tmp0;
2436       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2437       sum4   -= v4[0] * tmp0;
2438       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2439       break;
2440     case 5 :
2441       sum1 = tmp[row];
2442       sum2 = tmp[row -1];
2443       sum3 = tmp[row -2];
2444       sum4 = tmp[row -3];
2445       sum5 = tmp[row -4];
2446       v2   = aa + ad[row]+1;
2447       v3   = aa + ad[row -1]+1;
2448       v4   = aa + ad[row -2]+1;
2449       v5   = aa + ad[row -3]+1;
2450       for (j=0 ; j<nz-1; j+=2){
2451         i0   = vi[j];
2452         i1   = vi[j+1];
2453         tmp0 = tmps[i0];
2454         tmp1 = tmps[i1];
2455         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2456         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2457         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2458         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2459         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2460       }
2461       if (j==nz-1){
2462         tmp0  = tmps[vi[j]];
2463         sum1 -= v1[j] * tmp0;
2464         sum2 -= v2[j+1] * tmp0;
2465         sum3 -= v3[j+2] * tmp0;
2466         sum4 -= v4[j+3] * tmp0;
2467         sum5 -= v5[j+4] * tmp0;
2468       }
2469 
2470       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2471       sum2   -= v2[0] * tmp0;
2472       sum3   -= v3[1] * tmp0;
2473       sum4   -= v4[2] * tmp0;
2474       sum5   -= v5[3] * tmp0;
2475       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2476       sum3   -= v3[0] * tmp0;
2477       sum4   -= v4[1] * tmp0;
2478       sum5   -= v5[2] * tmp0;
2479       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2480       sum4   -= v4[0] * tmp0;
2481       sum5   -= v5[1] * tmp0;
2482       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2483       sum5   -= v5[0] * tmp0;
2484       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2485       break;
2486     default:
2487       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2488     }
2489   }
2490   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2491   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2492   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2493   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2494   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2495   PetscFunctionReturn(0);
2496 }
2497 
2498 
2499 /*
2500      Makes a longer coloring[] array and calls the usual code with that
2501 */
2502 #undef __FUNCT__
2503 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2504 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2505 {
2506   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2507   PetscErrorCode  ierr;
2508   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2509   PetscInt        *colorused,i;
2510   ISColoringValue *newcolor;
2511 
2512   PetscFunctionBegin;
2513   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2514   /* loop over inodes, marking a color for each column*/
2515   row = 0;
2516   for (i=0; i<m; i++){
2517     for (j=0; j<ns[i]; j++) {
2518       newcolor[row++] = coloring[i] + j*ncolors;
2519     }
2520   }
2521 
2522   /* eliminate unneeded colors */
2523   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2524   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2525   for (i=0; i<n; i++) {
2526     colorused[newcolor[i]] = 1;
2527   }
2528 
2529   for (i=1; i<5*ncolors; i++) {
2530     colorused[i] += colorused[i-1];
2531   }
2532   ncolors = colorused[5*ncolors-1];
2533   for (i=0; i<n; i++) {
2534     newcolor[i] = colorused[newcolor[i]]-1;
2535   }
2536   ierr = PetscFree(colorused);CHKERRQ(ierr);
2537   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2538   ierr = PetscFree(coloring);CHKERRQ(ierr);
2539   PetscFunctionReturn(0);
2540 }
2541 
2542 #include "../src/mat/blockinvert.h"
2543 
2544 #undef __FUNCT__
2545 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2546 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2547 {
2548   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2549   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2550   MatScalar          *ibdiag,*bdiag,work[25];
2551   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2552   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2553   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2554   PetscErrorCode     ierr;
2555   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2556   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2557 
2558   PetscFunctionBegin;
2559   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2560   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2561   if (its > 1) {
2562     /* switch to non-inode version */
2563     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2564     PetscFunctionReturn(0);
2565   }
2566 
2567   if (!a->inode.ibdiagvalid) {
2568     if (!a->inode.ibdiag) {
2569       /* calculate space needed for diagonal blocks */
2570       for (i=0; i<m; i++) {
2571 	cnt += sizes[i]*sizes[i];
2572       }
2573       a->inode.bdiagsize = cnt;
2574       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2575     }
2576 
2577     /* copy over the diagonal blocks and invert them */
2578     ibdiag = a->inode.ibdiag;
2579     bdiag  = a->inode.bdiag;
2580     cnt = 0;
2581     for (i=0, row = 0; i<m; i++) {
2582       for (j=0; j<sizes[i]; j++) {
2583         for (k=0; k<sizes[i]; k++) {
2584           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2585         }
2586       }
2587       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2588 
2589       switch(sizes[i]) {
2590         case 1:
2591           /* Create matrix data structure */
2592           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2593           ibdiag[cnt] = 1.0/ibdiag[cnt];
2594           break;
2595         case 2:
2596           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2597           break;
2598         case 3:
2599           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2600           break;
2601         case 4:
2602           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2603           break;
2604         case 5:
2605           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2606           break;
2607        default:
2608 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2609       }
2610       cnt += sizes[i]*sizes[i];
2611       row += sizes[i];
2612     }
2613     a->inode.ibdiagvalid = PETSC_TRUE;
2614   }
2615   ibdiag = a->inode.ibdiag;
2616   bdiag  = a->inode.bdiag;
2617 
2618 
2619   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2620   if (flag & SOR_ZERO_INITIAL_GUESS) {
2621     PetscScalar *b;
2622     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2623     if (xx != bb) {
2624       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2625     } else {
2626       b = x;
2627     }
2628     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2629 
2630       for (i=0, row=0; i<m; i++) {
2631         sz  = diag[row] - ii[row];
2632         v1  = a->a + ii[row];
2633         idx = a->j + ii[row];
2634 
2635         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2636         switch (sizes[i]){
2637           case 1:
2638 
2639             sum1  = b[row];
2640             for(n = 0; n<sz-1; n+=2) {
2641               i1   = idx[0];
2642               i2   = idx[1];
2643               idx += 2;
2644               tmp0 = x[i1];
2645               tmp1 = x[i2];
2646               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2647             }
2648 
2649             if (n == sz-1){
2650               tmp0  = x[*idx];
2651               sum1 -= *v1 * tmp0;
2652             }
2653             x[row++] = sum1*(*ibdiag++);
2654             break;
2655           case 2:
2656             v2    = a->a + ii[row+1];
2657             sum1  = b[row];
2658             sum2  = b[row+1];
2659             for(n = 0; n<sz-1; n+=2) {
2660               i1   = idx[0];
2661               i2   = idx[1];
2662               idx += 2;
2663               tmp0 = x[i1];
2664               tmp1 = x[i2];
2665               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2666               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2667             }
2668 
2669             if (n == sz-1){
2670               tmp0  = x[*idx];
2671               sum1 -= v1[0] * tmp0;
2672               sum2 -= v2[0] * tmp0;
2673             }
2674             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2675             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2676             ibdiag  += 4;
2677             break;
2678           case 3:
2679             v2    = a->a + ii[row+1];
2680             v3    = a->a + ii[row+2];
2681             sum1  = b[row];
2682             sum2  = b[row+1];
2683             sum3  = b[row+2];
2684             for(n = 0; n<sz-1; n+=2) {
2685               i1   = idx[0];
2686               i2   = idx[1];
2687               idx += 2;
2688               tmp0 = x[i1];
2689               tmp1 = x[i2];
2690               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2691               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2692               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2693             }
2694 
2695             if (n == sz-1){
2696               tmp0  = x[*idx];
2697               sum1 -= v1[0] * tmp0;
2698               sum2 -= v2[0] * tmp0;
2699               sum3 -= v3[0] * tmp0;
2700             }
2701             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2702             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2703             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2704             ibdiag  += 9;
2705             break;
2706           case 4:
2707             v2    = a->a + ii[row+1];
2708             v3    = a->a + ii[row+2];
2709             v4    = a->a + ii[row+3];
2710             sum1  = b[row];
2711             sum2  = b[row+1];
2712             sum3  = b[row+2];
2713             sum4  = b[row+3];
2714             for(n = 0; n<sz-1; n+=2) {
2715               i1   = idx[0];
2716               i2   = idx[1];
2717               idx += 2;
2718               tmp0 = x[i1];
2719               tmp1 = x[i2];
2720               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2721               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2722               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2723               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2724             }
2725 
2726             if (n == sz-1){
2727               tmp0  = x[*idx];
2728               sum1 -= v1[0] * tmp0;
2729               sum2 -= v2[0] * tmp0;
2730               sum3 -= v3[0] * tmp0;
2731               sum4 -= v4[0] * tmp0;
2732             }
2733             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2734             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2735             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2736             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2737             ibdiag  += 16;
2738             break;
2739           case 5:
2740             v2    = a->a + ii[row+1];
2741             v3    = a->a + ii[row+2];
2742             v4    = a->a + ii[row+3];
2743             v5    = a->a + ii[row+4];
2744             sum1  = b[row];
2745             sum2  = b[row+1];
2746             sum3  = b[row+2];
2747             sum4  = b[row+3];
2748             sum5  = b[row+4];
2749             for(n = 0; n<sz-1; n+=2) {
2750               i1   = idx[0];
2751               i2   = idx[1];
2752               idx += 2;
2753               tmp0 = x[i1];
2754               tmp1 = x[i2];
2755               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2756               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2757               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2758               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2759               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2760             }
2761 
2762             if (n == sz-1){
2763               tmp0  = x[*idx];
2764               sum1 -= v1[0] * tmp0;
2765               sum2 -= v2[0] * tmp0;
2766               sum3 -= v3[0] * tmp0;
2767               sum4 -= v4[0] * tmp0;
2768               sum5 -= v5[0] * tmp0;
2769             }
2770             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2771             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2772             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2773             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2774             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2775             ibdiag  += 25;
2776             break;
2777 	  default:
2778    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2779         }
2780       }
2781 
2782       xb = x;
2783       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2784     } else xb = b;
2785     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2786         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
2787       cnt = 0;
2788       for (i=0, row=0; i<m; i++) {
2789 
2790         switch (sizes[i]){
2791           case 1:
2792             x[row++] *= bdiag[cnt++];
2793             break;
2794           case 2:
2795             x1   = x[row]; x2 = x[row+1];
2796             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2797             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2798             x[row++] = tmp1;
2799             x[row++] = tmp2;
2800             cnt += 4;
2801             break;
2802           case 3:
2803             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2804             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2805             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2806             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2807             x[row++] = tmp1;
2808             x[row++] = tmp2;
2809             x[row++] = tmp3;
2810             cnt += 9;
2811             break;
2812           case 4:
2813             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2814             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2815             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2816             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2817             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2818             x[row++] = tmp1;
2819             x[row++] = tmp2;
2820             x[row++] = tmp3;
2821             x[row++] = tmp4;
2822             cnt += 16;
2823             break;
2824           case 5:
2825             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2826             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2827             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2828             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2829             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2830             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2831             x[row++] = tmp1;
2832             x[row++] = tmp2;
2833             x[row++] = tmp3;
2834             x[row++] = tmp4;
2835             x[row++] = tmp5;
2836             cnt += 25;
2837             break;
2838 	  default:
2839    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2840         }
2841       }
2842       ierr = PetscLogFlops(m);CHKERRQ(ierr);
2843     }
2844     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
2845 
2846       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2847       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2848         ibdiag -= sizes[i]*sizes[i];
2849         sz      = ii[row+1] - diag[row] - 1;
2850         v1      = a->a + diag[row] + 1;
2851         idx     = a->j + diag[row] + 1;
2852 
2853         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2854         switch (sizes[i]){
2855           case 1:
2856 
2857             sum1  = xb[row];
2858             for(n = 0; n<sz-1; n+=2) {
2859               i1   = idx[0];
2860               i2   = idx[1];
2861               idx += 2;
2862               tmp0 = x[i1];
2863               tmp1 = x[i2];
2864               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2865             }
2866 
2867             if (n == sz-1){
2868               tmp0  = x[*idx];
2869               sum1 -= *v1*tmp0;
2870             }
2871             x[row--] = sum1*(*ibdiag);
2872             break;
2873 
2874           case 2:
2875 
2876             sum1  = xb[row];
2877             sum2  = xb[row-1];
2878             /* note that sum1 is associated with the second of the two rows */
2879             v2    = a->a + diag[row-1] + 2;
2880             for(n = 0; n<sz-1; n+=2) {
2881               i1   = idx[0];
2882               i2   = idx[1];
2883               idx += 2;
2884               tmp0 = x[i1];
2885               tmp1 = x[i2];
2886               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2887               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2888             }
2889 
2890             if (n == sz-1){
2891               tmp0  = x[*idx];
2892               sum1 -= *v1*tmp0;
2893               sum2 -= *v2*tmp0;
2894             }
2895             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
2896             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
2897             break;
2898           case 3:
2899 
2900             sum1  = xb[row];
2901             sum2  = xb[row-1];
2902             sum3  = xb[row-2];
2903             v2    = a->a + diag[row-1] + 2;
2904             v3    = a->a + diag[row-2] + 3;
2905             for(n = 0; n<sz-1; n+=2) {
2906               i1   = idx[0];
2907               i2   = idx[1];
2908               idx += 2;
2909               tmp0 = x[i1];
2910               tmp1 = x[i2];
2911               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2912               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2913               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2914             }
2915 
2916             if (n == sz-1){
2917               tmp0  = x[*idx];
2918               sum1 -= *v1*tmp0;
2919               sum2 -= *v2*tmp0;
2920               sum3 -= *v3*tmp0;
2921             }
2922             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2923             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2924             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2925             break;
2926           case 4:
2927 
2928             sum1  = xb[row];
2929             sum2  = xb[row-1];
2930             sum3  = xb[row-2];
2931             sum4  = xb[row-3];
2932             v2    = a->a + diag[row-1] + 2;
2933             v3    = a->a + diag[row-2] + 3;
2934             v4    = a->a + diag[row-3] + 4;
2935             for(n = 0; n<sz-1; n+=2) {
2936               i1   = idx[0];
2937               i2   = idx[1];
2938               idx += 2;
2939               tmp0 = x[i1];
2940               tmp1 = x[i2];
2941               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2942               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2943               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2944               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2945             }
2946 
2947             if (n == sz-1){
2948               tmp0  = x[*idx];
2949               sum1 -= *v1*tmp0;
2950               sum2 -= *v2*tmp0;
2951               sum3 -= *v3*tmp0;
2952               sum4 -= *v4*tmp0;
2953             }
2954             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2955             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2956             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2957             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2958             break;
2959           case 5:
2960 
2961             sum1  = xb[row];
2962             sum2  = xb[row-1];
2963             sum3  = xb[row-2];
2964             sum4  = xb[row-3];
2965             sum5  = xb[row-4];
2966             v2    = a->a + diag[row-1] + 2;
2967             v3    = a->a + diag[row-2] + 3;
2968             v4    = a->a + diag[row-3] + 4;
2969             v5    = a->a + diag[row-4] + 5;
2970             for(n = 0; n<sz-1; n+=2) {
2971               i1   = idx[0];
2972               i2   = idx[1];
2973               idx += 2;
2974               tmp0 = x[i1];
2975               tmp1 = x[i2];
2976               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2977               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2978               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2979               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2980               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2981             }
2982 
2983             if (n == sz-1){
2984               tmp0  = x[*idx];
2985               sum1 -= *v1*tmp0;
2986               sum2 -= *v2*tmp0;
2987               sum3 -= *v3*tmp0;
2988               sum4 -= *v4*tmp0;
2989               sum5 -= *v5*tmp0;
2990             }
2991             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2992             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2993             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2994             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2995             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2996             break;
2997 	  default:
2998    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2999         }
3000       }
3001 
3002       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3003     }
3004     its--;
3005     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3006     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
3007   }
3008   if (flag & SOR_EISENSTAT) {
3009     const PetscScalar *b;
3010     MatScalar         *t = a->inode.ssor_work;
3011 
3012     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3013     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3014     /*
3015           Apply  (U + D)^-1  where D is now the block diagonal
3016     */
3017     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3018     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3019       ibdiag -= sizes[i]*sizes[i];
3020       sz      = ii[row+1] - diag[row] - 1;
3021       v1      = a->a + diag[row] + 1;
3022       idx     = a->j + diag[row] + 1;
3023       CHKMEMQ;
3024       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3025       switch (sizes[i]){
3026         case 1:
3027 
3028 	  sum1  = b[row];
3029 	  for(n = 0; n<sz-1; n+=2) {
3030 	    i1   = idx[0];
3031 	    i2   = idx[1];
3032 	    idx += 2;
3033 	    tmp0 = x[i1];
3034 	    tmp1 = x[i2];
3035 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3036 	  }
3037 
3038 	  if (n == sz-1){
3039 	    tmp0  = x[*idx];
3040 	    sum1 -= *v1*tmp0;
3041 	  }
3042 	  x[row] = sum1*(*ibdiag);row--;
3043 	  break;
3044 
3045         case 2:
3046 
3047 	  sum1  = b[row];
3048 	  sum2  = b[row-1];
3049 	  /* note that sum1 is associated with the second of the two rows */
3050 	  v2    = a->a + diag[row-1] + 2;
3051 	  for(n = 0; n<sz-1; n+=2) {
3052 	    i1   = idx[0];
3053 	    i2   = idx[1];
3054 	    idx += 2;
3055 	    tmp0 = x[i1];
3056 	    tmp1 = x[i2];
3057 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3058 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3059 	  }
3060 
3061 	  if (n == sz-1){
3062 	    tmp0  = x[*idx];
3063 	    sum1 -= *v1*tmp0;
3064 	    sum2 -= *v2*tmp0;
3065 	  }
3066 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3067 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3068           row -= 2;
3069 	  break;
3070         case 3:
3071 
3072 	  sum1  = b[row];
3073 	  sum2  = b[row-1];
3074 	  sum3  = b[row-2];
3075 	  v2    = a->a + diag[row-1] + 2;
3076 	  v3    = a->a + diag[row-2] + 3;
3077 	  for(n = 0; n<sz-1; n+=2) {
3078 	    i1   = idx[0];
3079 	    i2   = idx[1];
3080 	    idx += 2;
3081 	    tmp0 = x[i1];
3082 	    tmp1 = x[i2];
3083 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3084 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3085 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3086 	  }
3087 
3088 	  if (n == sz-1){
3089 	    tmp0  = x[*idx];
3090 	    sum1 -= *v1*tmp0;
3091 	    sum2 -= *v2*tmp0;
3092 	    sum3 -= *v3*tmp0;
3093 	  }
3094 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3095 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3096 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3097           row -= 3;
3098 	  break;
3099         case 4:
3100 
3101 	  sum1  = b[row];
3102 	  sum2  = b[row-1];
3103 	  sum3  = b[row-2];
3104 	  sum4  = b[row-3];
3105 	  v2    = a->a + diag[row-1] + 2;
3106 	  v3    = a->a + diag[row-2] + 3;
3107 	  v4    = a->a + diag[row-3] + 4;
3108 	  for(n = 0; n<sz-1; n+=2) {
3109 	    i1   = idx[0];
3110 	    i2   = idx[1];
3111 	    idx += 2;
3112 	    tmp0 = x[i1];
3113 	    tmp1 = x[i2];
3114 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3115 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3116 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3117 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3118 	  }
3119 
3120 	  if (n == sz-1){
3121 	    tmp0  = x[*idx];
3122 	    sum1 -= *v1*tmp0;
3123 	    sum2 -= *v2*tmp0;
3124 	    sum3 -= *v3*tmp0;
3125 	    sum4 -= *v4*tmp0;
3126 	  }
3127 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3128 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3129 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3130 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3131           row -= 4;
3132 	  break;
3133         case 5:
3134 
3135 	  sum1  = b[row];
3136 	  sum2  = b[row-1];
3137 	  sum3  = b[row-2];
3138 	  sum4  = b[row-3];
3139 	  sum5  = b[row-4];
3140 	  v2    = a->a + diag[row-1] + 2;
3141 	  v3    = a->a + diag[row-2] + 3;
3142 	  v4    = a->a + diag[row-3] + 4;
3143 	  v5    = a->a + diag[row-4] + 5;
3144 	  for(n = 0; n<sz-1; n+=2) {
3145 	    i1   = idx[0];
3146 	    i2   = idx[1];
3147 	    idx += 2;
3148 	    tmp0 = x[i1];
3149 	    tmp1 = x[i2];
3150 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3151 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3152 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3153 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3154 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3155 	  }
3156 
3157 	  if (n == sz-1){
3158 	    tmp0  = x[*idx];
3159 	    sum1 -= *v1*tmp0;
3160 	    sum2 -= *v2*tmp0;
3161 	    sum3 -= *v3*tmp0;
3162 	    sum4 -= *v4*tmp0;
3163 	    sum5 -= *v5*tmp0;
3164 	  }
3165 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3166 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3167 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3168 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3169 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3170           row -= 5;
3171 	  break;
3172         default:
3173 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3174       }
3175       CHKMEMQ;
3176     }
3177     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3178 
3179     /*
3180            t = b - D x    where D is the block diagonal
3181     */
3182     cnt = 0;
3183     for (i=0, row=0; i<m; i++) {
3184       CHKMEMQ;
3185       switch (sizes[i]){
3186         case 1:
3187 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3188 	  break;
3189         case 2:
3190 	  x1   = x[row]; x2 = x[row+1];
3191 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3192 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3193 	  t[row]   = b[row] - tmp1;
3194 	  t[row+1] = b[row+1] - tmp2; row += 2;
3195 	  cnt += 4;
3196 	  break;
3197         case 3:
3198 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3199 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3200 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3201 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3202 	  t[row] = b[row] - tmp1;
3203 	  t[row+1] = b[row+1] - tmp2;
3204 	  t[row+2] = b[row+2] - tmp3; row += 3;
3205 	  cnt += 9;
3206 	  break;
3207         case 4:
3208 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3209 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3210 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3211 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3212 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3213 	  t[row] = b[row] - tmp1;
3214 	  t[row+1] = b[row+1] - tmp2;
3215 	  t[row+2] = b[row+2] - tmp3;
3216 	  t[row+3] = b[row+3] - tmp4; row += 4;
3217 	  cnt += 16;
3218 	  break;
3219         case 5:
3220 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3221 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3222 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3223 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3224 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3225 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3226 	  t[row] = b[row] - tmp1;
3227 	  t[row+1] = b[row+1] - tmp2;
3228 	  t[row+2] = b[row+2] - tmp3;
3229 	  t[row+3] = b[row+3] - tmp4;
3230 	  t[row+4] = b[row+4] - tmp5;row += 5;
3231 	  cnt += 25;
3232 	  break;
3233         default:
3234 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3235       }
3236       CHKMEMQ;
3237     }
3238     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3239 
3240 
3241 
3242     /*
3243           Apply (L + D)^-1 where D is the block diagonal
3244     */
3245     for (i=0, row=0; i<m; i++) {
3246       sz  = diag[row] - ii[row];
3247       v1  = a->a + ii[row];
3248       idx = a->j + ii[row];
3249       CHKMEMQ;
3250       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3251       switch (sizes[i]){
3252         case 1:
3253 
3254 	  sum1  = t[row];
3255 	  for(n = 0; n<sz-1; n+=2) {
3256 	    i1   = idx[0];
3257 	    i2   = idx[1];
3258 	    idx += 2;
3259 	    tmp0 = t[i1];
3260 	    tmp1 = t[i2];
3261 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3262 	  }
3263 
3264 	  if (n == sz-1){
3265 	    tmp0  = t[*idx];
3266 	    sum1 -= *v1 * tmp0;
3267 	  }
3268 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3269 	  break;
3270         case 2:
3271 	  v2    = a->a + ii[row+1];
3272 	  sum1  = t[row];
3273 	  sum2  = t[row+1];
3274 	  for(n = 0; n<sz-1; n+=2) {
3275 	    i1   = idx[0];
3276 	    i2   = idx[1];
3277 	    idx += 2;
3278 	    tmp0 = t[i1];
3279 	    tmp1 = t[i2];
3280 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3281 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3282 	  }
3283 
3284 	  if (n == sz-1){
3285 	    tmp0  = t[*idx];
3286 	    sum1 -= v1[0] * tmp0;
3287 	    sum2 -= v2[0] * tmp0;
3288 	  }
3289 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3290 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3291 	  ibdiag  += 4; row += 2;
3292 	  break;
3293         case 3:
3294 	  v2    = a->a + ii[row+1];
3295 	  v3    = a->a + ii[row+2];
3296 	  sum1  = t[row];
3297 	  sum2  = t[row+1];
3298 	  sum3  = t[row+2];
3299 	  for(n = 0; n<sz-1; n+=2) {
3300 	    i1   = idx[0];
3301 	    i2   = idx[1];
3302 	    idx += 2;
3303 	    tmp0 = t[i1];
3304 	    tmp1 = t[i2];
3305 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3306 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3307 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3308 	  }
3309 
3310 	  if (n == sz-1){
3311 	    tmp0  = t[*idx];
3312 	    sum1 -= v1[0] * tmp0;
3313 	    sum2 -= v2[0] * tmp0;
3314 	    sum3 -= v3[0] * tmp0;
3315 	  }
3316 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3317 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3318 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3319 	  ibdiag  += 9; row += 3;
3320 	  break;
3321         case 4:
3322 	  v2    = a->a + ii[row+1];
3323 	  v3    = a->a + ii[row+2];
3324 	  v4    = a->a + ii[row+3];
3325 	  sum1  = t[row];
3326 	  sum2  = t[row+1];
3327 	  sum3  = t[row+2];
3328 	  sum4  = t[row+3];
3329 	  for(n = 0; n<sz-1; n+=2) {
3330 	    i1   = idx[0];
3331 	    i2   = idx[1];
3332 	    idx += 2;
3333 	    tmp0 = t[i1];
3334 	    tmp1 = t[i2];
3335 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3336 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3337 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3338 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3339 	  }
3340 
3341 	  if (n == sz-1){
3342 	    tmp0  = t[*idx];
3343 	    sum1 -= v1[0] * tmp0;
3344 	    sum2 -= v2[0] * tmp0;
3345 	    sum3 -= v3[0] * tmp0;
3346 	    sum4 -= v4[0] * tmp0;
3347 	  }
3348 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3349 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3350 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3351 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3352 	  ibdiag  += 16; row += 4;
3353 	  break;
3354         case 5:
3355 	  v2    = a->a + ii[row+1];
3356 	  v3    = a->a + ii[row+2];
3357 	  v4    = a->a + ii[row+3];
3358 	  v5    = a->a + ii[row+4];
3359 	  sum1  = t[row];
3360 	  sum2  = t[row+1];
3361 	  sum3  = t[row+2];
3362 	  sum4  = t[row+3];
3363 	  sum5  = t[row+4];
3364 	  for(n = 0; n<sz-1; n+=2) {
3365 	    i1   = idx[0];
3366 	    i2   = idx[1];
3367 	    idx += 2;
3368 	    tmp0 = t[i1];
3369 	    tmp1 = t[i2];
3370 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3371 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3372 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3373 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3374 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3375 	  }
3376 
3377 	  if (n == sz-1){
3378 	    tmp0  = t[*idx];
3379 	    sum1 -= v1[0] * tmp0;
3380 	    sum2 -= v2[0] * tmp0;
3381 	    sum3 -= v3[0] * tmp0;
3382 	    sum4 -= v4[0] * tmp0;
3383 	    sum5 -= v5[0] * tmp0;
3384 	  }
3385 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3386 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3387 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3388 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3389 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3390 	  ibdiag  += 25; row += 5;
3391 	  break;
3392         default:
3393 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3394       }
3395       CHKMEMQ;
3396     }
3397     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3398     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3399     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3400   }
3401   PetscFunctionReturn(0);
3402 }
3403 
3404 #undef __FUNCT__
3405 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3406 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3407 {
3408   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3409   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3410   const MatScalar    *bdiag = a->inode.bdiag;
3411   const PetscScalar  *b;
3412   PetscErrorCode      ierr;
3413   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3414   const PetscInt      *sizes = a->inode.size;
3415 
3416   PetscFunctionBegin;
3417   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3418   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3419   cnt = 0;
3420   for (i=0, row=0; i<m; i++) {
3421     switch (sizes[i]){
3422       case 1:
3423 	x[row] = b[row]*bdiag[cnt++];row++;
3424 	break;
3425       case 2:
3426 	x1   = b[row]; x2 = b[row+1];
3427 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3428 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3429 	x[row++] = tmp1;
3430 	x[row++] = tmp2;
3431 	cnt += 4;
3432 	break;
3433       case 3:
3434 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3435 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3436 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3437 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3438 	x[row++] = tmp1;
3439 	x[row++] = tmp2;
3440 	x[row++] = tmp3;
3441 	cnt += 9;
3442 	break;
3443       case 4:
3444 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3445 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3446 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3447 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3448 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3449 	x[row++] = tmp1;
3450 	x[row++] = tmp2;
3451 	x[row++] = tmp3;
3452 	x[row++] = tmp4;
3453 	cnt += 16;
3454 	break;
3455       case 5:
3456 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3457 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3458 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3459 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3460 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3461 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3462 	x[row++] = tmp1;
3463 	x[row++] = tmp2;
3464 	x[row++] = tmp3;
3465 	x[row++] = tmp4;
3466 	x[row++] = tmp5;
3467 	cnt += 25;
3468 	break;
3469       default:
3470 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3471     }
3472   }
3473   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3474   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3475   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3476   PetscFunctionReturn(0);
3477 }
3478 
3479 /*
3480     samestructure indicates that the matrix has not changed its nonzero structure so we
3481     do not need to recompute the inodes
3482 */
3483 #undef __FUNCT__
3484 #define __FUNCT__ "Mat_CheckInode"
3485 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3486 {
3487   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3488   PetscErrorCode ierr;
3489   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3490   PetscTruth     flag;
3491 
3492   PetscFunctionBegin;
3493   if (!a->inode.use)                     PetscFunctionReturn(0);
3494   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3495 
3496 
3497   m = A->rmap->n;
3498   if (a->inode.size) {ns = a->inode.size;}
3499   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3500 
3501   i          = 0;
3502   node_count = 0;
3503   idx        = a->j;
3504   ii         = a->i;
3505   while (i < m){                /* For each row */
3506     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3507     /* Limits the number of elements in a node to 'a->inode.limit' */
3508     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3509       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3510       if (nzy != nzx) break;
3511       idy  += nzx;             /* Same nonzero pattern */
3512       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3513       if (!flag) break;
3514     }
3515     ns[node_count++] = blk_size;
3516     idx += blk_size*nzx;
3517     i    = j;
3518   }
3519   /* If not enough inodes found,, do not use inode version of the routines */
3520   if (!a->inode.size && m && node_count > .9*m) {
3521     ierr = PetscFree(ns);CHKERRQ(ierr);
3522     a->inode.node_count     = 0;
3523     a->inode.size           = PETSC_NULL;
3524     a->inode.use            = PETSC_FALSE;
3525     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3526   } else {
3527     A->ops->mult              = MatMult_SeqAIJ_Inode;
3528     A->ops->sor               = MatSOR_SeqAIJ_Inode;
3529     A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3530     A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3531     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3532     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3533     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3534     A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3535     A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3536     A->ops->lufactornumeric   = MatLUFactorNumeric_SeqAIJ_Inode_inplace;
3537     a->inode.node_count       = node_count;
3538     a->inode.size             = ns;
3539     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3540   }
3541   PetscFunctionReturn(0);
3542 }
3543 
3544 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
3545 PetscInt __k, *__vi; \
3546 __vi = aj + ai[row];				\
3547 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
3548 __vi = aj + adiag[row];				\
3549 cols[nzl] = __vi[0];\
3550 __vi = aj + adiag[row+1]+1;\
3551 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
3552 
3553 
3554 /*
3555    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3556    Modified from Mat_CheckInode().
3557 
3558    Input Parameters:
3559 +  Mat A - ILU or LU matrix factor
3560 -  samestructure - TURE indicates that the matrix has not changed its nonzero structure so we
3561     do not need to recompute the inodes
3562 */
3563 #undef __FUNCT__
3564 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3565 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3566 {
3567   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3568   PetscErrorCode ierr;
3569   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3570   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3571   PetscTruth     flag;
3572 
3573   PetscFunctionBegin;
3574   if (!a->inode.use)                     PetscFunctionReturn(0);
3575   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3576 
3577   m = A->rmap->n;
3578   if (a->inode.size) {ns = a->inode.size;}
3579   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3580 
3581   i          = 0;
3582   node_count = 0;
3583   while (i < m){                /* For each row */
3584     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3585     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3586     nzx  = nzl1 + nzu1 + 1;
3587     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3588     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3589 
3590     /* Limits the number of elements in a node to 'a->inode.limit' */
3591     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3592       nzl2    = ai[j+1] - ai[j];
3593       nzu2    = adiag[j] - adiag[j+1] - 1;
3594       nzy     = nzl2 + nzu2 + 1;
3595       if( nzy != nzx) break;
3596       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3597       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
3598       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3599       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3600       ierr = PetscFree(cols2);CHKERRQ(ierr);
3601     }
3602     ns[node_count++] = blk_size;
3603     ierr = PetscFree(cols1);CHKERRQ(ierr);
3604     i    = j;
3605   }
3606   /* If not enough inodes found,, do not use inode version of the routines */
3607   if (!a->inode.size && m && node_count > .9*m) {
3608     ierr = PetscFree(ns);CHKERRQ(ierr);
3609     a->inode.node_count     = 0;
3610     a->inode.size           = PETSC_NULL;
3611     a->inode.use            = PETSC_FALSE;
3612     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3613   } else {
3614     A->ops->mult              = 0;
3615     A->ops->sor               = 0;
3616     A->ops->multadd           = 0;
3617     A->ops->getrowij          = 0;
3618     A->ops->restorerowij      = 0;
3619     A->ops->getcolumnij       = 0;
3620     A->ops->restorecolumnij   = 0;
3621     A->ops->coloringpatch     = 0;
3622     A->ops->multdiagonalblock = 0;
3623     A->ops->lufactornumeric   = MatLUFactorNumeric_SeqAIJ_Inode; /* not done yet */
3624     a->inode.node_count       = node_count;
3625     a->inode.size             = ns;
3626     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3627   }
3628   PetscFunctionReturn(0);
3629 }
3630 
3631 /*
3632      This is really ugly. if inodes are used this replaces the
3633   permutations with ones that correspond to rows/cols of the matrix
3634   rather then inode blocks
3635 */
3636 #undef __FUNCT__
3637 #define __FUNCT__ "MatInodeAdjustForInodes"
3638 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3639 {
3640   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3641 
3642   PetscFunctionBegin;
3643   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3644   if (f) {
3645     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3646   }
3647   PetscFunctionReturn(0);
3648 }
3649 
3650 EXTERN_C_BEGIN
3651 #undef __FUNCT__
3652 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3653 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3654 {
3655   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3656   PetscErrorCode ierr;
3657   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3658   const PetscInt *ridx,*cidx;
3659   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3660   PetscInt       nslim_col,*ns_col;
3661   IS             ris = *rperm,cis = *cperm;
3662 
3663   PetscFunctionBegin;
3664   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3665   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3666 
3667   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3668   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3669   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3670 
3671   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3672   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3673 
3674   /* Form the inode structure for the rows of permuted matric using inv perm*/
3675   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3676 
3677   /* Construct the permutations for rows*/
3678   for (i=0,row = 0; i<nslim_row; ++i){
3679     indx      = ridx[i];
3680     start_val = tns[indx];
3681     end_val   = tns[indx + 1];
3682     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3683   }
3684 
3685   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3686   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3687 
3688  /* Construct permutations for columns */
3689   for (i=0,col=0; i<nslim_col; ++i){
3690     indx      = cidx[i];
3691     start_val = tns[indx];
3692     end_val   = tns[indx + 1];
3693     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3694   }
3695 
3696   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3697   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3698   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3699   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3700 
3701   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3702   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3703 
3704   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3705   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3706   ierr = ISDestroy(cis);CHKERRQ(ierr);
3707   ierr = ISDestroy(ris);CHKERRQ(ierr);
3708   ierr = PetscFree(tns);CHKERRQ(ierr);
3709   PetscFunctionReturn(0);
3710 }
3711 EXTERN_C_END
3712 
3713 #undef __FUNCT__
3714 #define __FUNCT__ "MatInodeGetInodeSizes"
3715 /*@C
3716    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3717 
3718    Collective on Mat
3719 
3720    Input Parameter:
3721 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3722 
3723    Output Parameter:
3724 +  node_count - no of inodes present in the matrix.
3725 .  sizes      - an array of size node_count,with sizes of each inode.
3726 -  limit      - the max size used to generate the inodes.
3727 
3728    Level: advanced
3729 
3730    Notes: This routine returns some internal storage information
3731    of the matrix, it is intended to be used by advanced users.
3732    It should be called after the matrix is assembled.
3733    The contents of the sizes[] array should not be changed.
3734    PETSC_NULL may be passed for information not requested.
3735 
3736 .keywords: matrix, seqaij, get, inode
3737 
3738 .seealso: MatGetInfo()
3739 @*/
3740 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3741 {
3742   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3743 
3744   PetscFunctionBegin;
3745   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3746   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3747   if (f) {
3748     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3749   }
3750   PetscFunctionReturn(0);
3751 }
3752 
3753 EXTERN_C_BEGIN
3754 #undef __FUNCT__
3755 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3756 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3757 {
3758   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3759 
3760   PetscFunctionBegin;
3761   if (node_count) *node_count = a->inode.node_count;
3762   if (sizes)      *sizes      = a->inode.size;
3763   if (limit)      *limit      = a->inode.limit;
3764   PetscFunctionReturn(0);
3765 }
3766 EXTERN_C_END
3767