xref: /petsc/src/mat/impls/aij/seq/inode.c (revision b89f182d523227cdac08169ad4acabdbdf9ddad3)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat              C=B;
1186   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187   IS               isrow = b->row,isicol = b->icol;
1188   PetscErrorCode   ierr;
1189   const PetscInt   *r,*ic,*ics;
1190   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191   PetscInt         i,j,k,nz,nzL,row,*pj;
1192   const PetscInt   *ajtmp,*bjtmp;
1193   MatScalar        *pc,*pc1,*pc2,*pc3,mul1,mul2,mul3,*pv,*rtmp1,*rtmp2,*rtmp3;
1194   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3;
1195   FactorShiftCtx   sctx;
1196   PetscInt         *ddiag;
1197   PetscReal        rs;
1198   MatScalar        d;
1199   PetscInt         inod,nodesz,node_max,*ns,col;
1200   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1201 
1202   PetscFunctionBegin;
1203   /* MatPivotSetUp(): initialize shift context sctx */
1204   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1205 
1206   if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207     ddiag          = a->diag;
1208     sctx.shift_top = info->zeropivot;
1209     for (i=0; i<n; i++) {
1210       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211       d  = (aa)[ddiag[i]];
1212       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213       v  = aa+ai[i];
1214       nz = ai[i+1] - ai[i];
1215       for (j=0; j<nz; j++)
1216 	rs += PetscAbsScalar(v[j]);
1217       if (rs>sctx.shift_top) sctx.shift_top = rs;
1218     }
1219     sctx.shift_top   *= 1.1;
1220     sctx.nshift_max   = 5;
1221     sctx.shift_lo     = 0.;
1222     sctx.shift_hi     = 1.;
1223   }
1224 
1225   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1226   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1227 
1228   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1229   ierr  = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1230   rtmp2 = rtmp1 + n;
1231   rtmp3 = rtmp2 + n;
1232   ics   = ic;
1233 
1234   node_max = a->inode.node_count;
1235   ns       = a->inode.size;
1236   if (!ns){
1237     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1238   }
1239 
1240   /* If max inode size > 3, split it into two inodes.*/
1241   /* also map the inode sizes according to the ordering */
1242   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1243   for (i=0,j=0; i<node_max; ++i,++j){
1244     if (ns[i]>3) {
1245       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1246       ++j;
1247       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1248     } else {
1249       tmp_vec1[j] = ns[i];
1250     }
1251   }
1252   /* Use the correct node_max */
1253   node_max = j;
1254 
1255   /* Now reorder the inode info based on mat re-ordering info */
1256   /* First create a row -> inode_size_array_index map */
1257   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1258   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1259   for (i=0,row=0; i<node_max; i++) {
1260     nodesz = tmp_vec1[i];
1261     for (j=0; j<nodesz; j++,row++) {
1262       nsmap[row] = i;
1263     }
1264   }
1265   /* Using nsmap, create a reordered ns structure */
1266   for (i=0,j=0; i< node_max; i++) {
1267     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1268     tmp_vec2[i]  = nodesz;
1269     j           += nodesz;
1270   }
1271   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1272   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1273 
1274   /* Now use the correct ns */
1275   ns = tmp_vec2;
1276 
1277   do {
1278     sctx.useshift = PETSC_FALSE;
1279     /* Now loop over each block-row, and do the factorization */
1280     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1281       nodesz = ns[inod];
1282 
1283       switch (nodesz){
1284       case 1:
1285       /*----------*/
1286         /* zero rtmp1 */
1287         /* L part */
1288         nz    = bi[i+1] - bi[i];
1289         bjtmp = bj + bi[i];
1290         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1291 
1292         /* U part */
1293         nz = bdiag[i]-bdiag[i+1];
1294         bjtmp = bj + bdiag[i+1]+1;
1295         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1296 
1297         /* load in initial (unfactored row) */
1298         nz    = ai[r[i]+1] - ai[r[i]];
1299         ajtmp = aj + ai[r[i]];
1300         v     = aa + ai[r[i]];
1301         for (j=0; j<nz; j++) {
1302           rtmp1[ics[ajtmp[j]]] = v[j];
1303         }
1304         /* ZeropivotApply() */
1305         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1306 
1307         /* elimination */
1308         bjtmp = bj + bi[i];
1309         row   = *bjtmp++;
1310         nzL   = bi[i+1] - bi[i];
1311         for(k=0; k < nzL;k++) {
1312           pc = rtmp1 + row;
1313           if (*pc != 0.0) {
1314             pv   = b->a + bdiag[row];
1315             mul1 = *pc * (*pv);
1316             *pc  = mul1;
1317             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1318             pv = b->a + bdiag[row+1]+1;
1319             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1320             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1321             ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1322           }
1323           row = *bjtmp++;
1324         }
1325 
1326         /* finished row so stick it into b->a */
1327         rs = 0.0;
1328         /* L part */
1329         pv = b->a + bi[i] ;
1330         pj = b->j + bi[i] ;
1331         nz = bi[i+1] - bi[i];
1332         for (j=0; j<nz; j++) {
1333           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1334         }
1335 
1336         /* U part */
1337         pv = b->a + bdiag[i+1]+1;
1338         pj = b->j + bdiag[i+1]+1;
1339         nz = bdiag[i] - bdiag[i+1]-1;
1340         for (j=0; j<nz; j++) {
1341           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1342         }
1343 
1344         /* Check zero pivot */
1345         sctx.rs = rs;
1346         sctx.pv = rtmp1[i];
1347         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1348 
1349         /* Mark diagonal and invert diagonal for simplier triangular solves */
1350         pv  = b->a + bdiag[i];
1351         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1352         break;
1353 
1354       case 2:
1355       /*----------*/
1356         /* zero rtmp1 and rtmp2 */
1357         /* L part */
1358         nz    = bi[i+1] - bi[i];
1359         bjtmp = bj + bi[i];
1360         for  (j=0; j<nz; j++) {
1361           col = bjtmp[j];
1362           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1363         }
1364 
1365         /* U part */
1366         nz = bdiag[i]-bdiag[i+1];
1367         bjtmp = bj + bdiag[i+1]+1;
1368         for  (j=0; j<nz; j++) {
1369           col = bjtmp[j];
1370           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1371         }
1372 
1373         /* load in initial (unfactored row) */
1374         nz    = ai[r[i]+1] - ai[r[i]];
1375         ajtmp = aj + ai[r[i]];
1376         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1377         for (j=0; j<nz; j++) {
1378           col = ics[ajtmp[j]];
1379           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1380         }
1381         /* ZeropivotApply(): shift the diagonal of the matrix  */
1382         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1383 
1384         /* elimination */
1385         bjtmp = bj + bi[i];
1386         row   = *bjtmp++; /* pivot row */
1387         nzL   = bi[i+1] - bi[i];
1388         for(k=0; k < nzL;k++) {
1389           pc1 = rtmp1 + row;
1390           pc2 = rtmp2 + row;
1391           if (*pc1 != 0.0 || *pc2 != 0.0) {
1392             pv         = b->a + bdiag[row];
1393             mul1 = *pc1*(*pv);    mul2 = *pc2*(*pv);
1394             *pc1        = mul1;   *pc2 = mul2;
1395 
1396             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1397             pv = b->a + bdiag[row+1]+1;
1398             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1399             for (j=0; j<nz; j++){
1400               col = pj[j];
1401               rtmp1[col] -= mul1 * pv[j];
1402               rtmp2[col] -= mul2 * pv[j];
1403             }
1404             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1405           }
1406           row = *bjtmp++;
1407         }
1408 
1409         /* finished row i; check zero pivot, then stick row i into b->a */
1410         rs  = 0.0;
1411         /* L part */
1412         k  = 0;
1413         pc1 = b->a + bi[i];
1414         pj  = b->j + bi[i] ;
1415         nz  = bi[i+1] - bi[i];
1416         for (j=0; j<nz; j++) {
1417           col = pj[j];
1418           if (col < i){pc1[k] = rtmp1[col]; k++; rs += PetscAbsScalar(pc1[k]);}
1419         }
1420         /* U part */
1421         pc1 = b->a + bdiag[i+1]+1;
1422         pj  = b->j + bdiag[i+1]+1;
1423         nz  = bdiag[i] - bdiag[i+1]; /* include diagonal */
1424         k  = 0;
1425         for (j=0; j<nz; j++) {
1426           col = pj[j];
1427           if (col > i)  {pc1[k] = rtmp1[col]; k++; rs += PetscAbsScalar(pc1[k]);}
1428         }
1429 
1430         sctx.rs  = rs;
1431         sctx.pv  = rtmp1[i];
1432         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1433         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1434         *pc1 = 1.0/sctx.pv;
1435 
1436         /* Now take care of diagonal 2x2 block. */
1437         pc2 = rtmp2 + i;
1438         if (*pc2 != 0.0){
1439           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1440           *pc2 = mul1;          /* insert L entry */
1441           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1442           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1443           for (j=0; j<nz; j++) {
1444             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1445           }
1446           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1447         }
1448 
1449         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1450         rs = 0.0;
1451         /* L part */
1452         pc2 = b->a + bi[i+1];
1453         pj  = b->j + bi[i+1] ;
1454         nz  = bi[i+2] - bi[i+1];
1455         k  = 0;
1456         for (j=0; j<nz; j++) {
1457           col = pj[j];
1458           if (col < i+1){pc2[k] = rtmp2[col]; k++; rs += PetscAbsScalar(pc2[k]);}
1459         }
1460         /* U part */
1461         pc2 = b->a + bdiag[i+2]+1;
1462         pj  = b->j + bdiag[i+1]+1;
1463         nz  = bdiag[i] - bdiag[i+1]; /* inlcude diagonal */
1464         k =0;
1465         for (j=0; j<nz; j++) {
1466           col = pj[j];
1467           if (col > i+1){pc2[k] = rtmp2[col]; k++; rs += PetscAbsScalar(pc2[k]);}
1468         }
1469 
1470         sctx.rs  = rs;
1471         sctx.pv  = rtmp2[i+1];
1472         ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr);
1473         pc2  = b->a + bdiag[i+1];
1474         *pc2 = 1.0/sctx.pv;
1475         break;
1476 
1477       case 3:
1478       /*----------*/
1479         /* zero rtmp */
1480         /* L part */
1481         nz    = bi[i+1] - bi[i];
1482         bjtmp = bj + bi[i];
1483         for  (j=0; j<nz; j++) {
1484           col = bjtmp[j];
1485           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1486         }
1487 
1488         /* U part */
1489         nz = bdiag[i]-bdiag[i+1];
1490         bjtmp = bj + bdiag[i+1]+1;
1491         for  (j=0; j<nz; j++) {
1492           col = bjtmp[j];
1493           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1494         }
1495 
1496         /* load in initial (unfactored row) */
1497         nz    = ai[r[i]+1] - ai[r[i]];
1498         ajtmp = aj + ai[r[i]];
1499         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1500         for (j=0; j<nz; j++) {
1501           col = ics[ajtmp[j]];
1502           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1503         }
1504         /* ZeropivotApply(): shift the diagonal of the matrix  */
1505         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1506 
1507         /* elimination */
1508         bjtmp = bj + bi[i];
1509         row   = *bjtmp++; /* pivot row */
1510         nzL   = bi[i+1] - bi[i];
1511         for(k=0; k < nzL;k++) {
1512           pc1 = rtmp1 + row;
1513           pc2 = rtmp2 + row;
1514           pc3 = rtmp3 + row;
1515           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1516             pv  = b->a + bdiag[row];
1517             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1518             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1519 
1520             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1521             pv = b->a + bdiag[row+1]+1;
1522             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1523             for (j=0; j<nz; j++){
1524               col = pj[j];
1525               rtmp1[col] -= mul1 * pv[j];
1526               rtmp2[col] -= mul2 * pv[j];
1527               rtmp3[col] -= mul3 * pv[j];
1528             }
1529             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1530           }
1531           row = *bjtmp++;
1532         }
1533 
1534         /* finished row i; check zero pivot, then stick row i into b->a */
1535         rs  = 0.0;
1536         /* L part */
1537         pc1 = b->a + bi[i];
1538         pj  = b->j + bi[i] ;
1539         nz  = bi[i+1] - bi[i];
1540         k  = 0;
1541         for (j=0; j<nz; j++) {
1542           col = pj[j];
1543           if (col < i){pc1[k] = rtmp1[col]; k++; rs += PetscAbsScalar(pc1[k]);}
1544         }
1545         /* U part */
1546         pc1 = b->a + bdiag[i+1]+1;
1547         pj  = b->j + bdiag[i+1]+1;
1548         nz  = bdiag[i] - bdiag[i+1]; /* include diagonal */
1549         k  = 0;
1550         for (j=0; j<nz; j++) {
1551           col = pj[j];
1552           if (col > i){pc1[k] = rtmp1[col]; k++; rs += PetscAbsScalar(pc1[k]);}
1553         }
1554 
1555         sctx.rs  = rs;
1556         sctx.pv  = rtmp1[i];
1557         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1558         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1559         *pc1 = 1.0/sctx.pv;
1560 
1561         /* Now take care of 1st column of diagonal 3x3 block. */
1562         pc2 = rtmp2 + i;
1563         pc3 = rtmp3 + i;
1564         if (*pc2 != 0.0 || *pc3 != 0.0){
1565           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1566           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1567           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1568           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1569           for (j=0; j<nz; j++) {
1570             col = pj[j];
1571             rtmp2[col] -= mul2 * rtmp1[col];
1572             rtmp3[col] -= mul3 * rtmp1[col];
1573           }
1574           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1575         }
1576 
1577         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1578         rs = 0.0;
1579         /* L part */
1580         pc2 = b->a + bi[i+1];
1581         pj  = b->j + bi[i+1] ;
1582         nz  = bi[i+2] - bi[i+1];
1583         k   = 0;
1584         for (j=0; j<nz; j++) {
1585           col = pj[j];
1586           if (col < i+1){pc2[k] = rtmp2[col]; k++; rs += PetscAbsScalar(pc2[k]);}
1587         }
1588         /* U part */
1589         pc2 = b->a + bdiag[i+2]+1;
1590         pj  = b->j + bdiag[i+1]+1;
1591         nz  = bdiag[i] - bdiag[i+1]; /* inlcude diagonal */
1592         k   = 0;
1593         for (j=0; j<nz; j++) {
1594           col = pj[j];
1595           if (col > i+1){pc2[k] = rtmp2[col]; k++; rs += PetscAbsScalar(pc2[k]);}
1596         }
1597 
1598         sctx.rs  = rs;
1599         sctx.pv  = rtmp2[i+1];
1600         ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr);
1601         pc2  = b->a + bdiag[i+1];
1602         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1603 
1604         /* Now take care of 2nd column of diagonal 3x3 block. */
1605         pc3 = rtmp3 + i+1;
1606         if (*pc3 != 0.0){
1607           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1608           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1609           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1610           for (j=0; j<nz; j++) {
1611             col = pj[j];
1612             rtmp3[col] -= mul3 * rtmp2[col];
1613           }
1614           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1615         }
1616 
1617         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1618         rs = 0.0;
1619         /* L part */
1620         pc3 = b->a + bi[i+2];
1621         pj  = b->j + bi[i+2] ;
1622         nz  = bi[i+3] - bi[i+2];
1623         k   = 0;
1624         for (j=0; j<nz; j++) {
1625           col = pj[j];
1626           if (col < i+2){pc3[k] = rtmp3[col]; k++; rs += PetscAbsScalar(pc3[k]);}
1627         }
1628         /* U part */
1629         pc3 = b->a + bdiag[i+3]+1;
1630         pj  = b->j + bdiag[i+1]+1;
1631         nz  = bdiag[i] - bdiag[i+1]; /* inlcude diagonal */
1632         k  = 0;
1633         for (j=0; j<nz; j++) {
1634           col = pj[j];
1635           if (col > i+2){pc3[k] = rtmp3[col]; k++; rs += PetscAbsScalar(pc3[k]);}
1636         }
1637 
1638         sctx.rs  = rs;
1639         sctx.pv  = rtmp3[i+2];
1640         ierr = MatPivotCheck(info,sctx,i+2);CHKERRQ(ierr);
1641         pc3  = b->a + bdiag[i+2];
1642         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1643         break;
1644 
1645       default:
1646         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
1647       }
1648       i += nodesz;                 /* Update the row */
1649     }
1650 
1651     /* MatPivotRefine() */
1652     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1653       /*
1654        * if no shift in this attempt & shifting & started shifting & can refine,
1655        * then try lower shift
1656        */
1657       sctx.shift_hi       = sctx.shift_fraction;
1658       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1659       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1660       sctx.useshift       = PETSC_TRUE;
1661       sctx.nshift++;
1662     }
1663   } while (sctx.useshift);
1664 
1665   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1666   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1667   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1668   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1669 
1670   C->ops->solve              = MatSolve_SeqAIJ_Inode;
1671   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1672   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1673   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1674   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1675   C->assembled    = PETSC_TRUE;
1676   C->preallocated = PETSC_TRUE;
1677   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1678 
1679   /* MatShiftView(A,info,&sctx) */
1680   if (sctx.nshift){
1681     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) {
1682       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1683     } else if (info->shifttype == MAT_SHIFT_NONZERO) {
1684       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1685     } else if (info->shifttype == MAT_SHIFT_INBLOCKS){
1686       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1687     }
1688   }
1689   PetscFunctionReturn(0);
1690 }
1691 
1692 #undef __FUNCT__
1693 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1694 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1695 {
1696   Mat               C = B;
1697   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1698   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1699   PetscErrorCode    ierr;
1700   const PetscInt    *r,*ic,*c,*ics;
1701   PetscInt          n = A->rmap->n,*bi = b->i;
1702   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1703   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1704   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1705   PetscScalar       mul1,mul2,mul3,tmp;
1706   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1707   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1708   PetscReal         rs=0.0;
1709   FactorShiftCtx    sctx;
1710   PetscInt          newshift;
1711 
1712   PetscFunctionBegin;
1713   sctx.shift_top      = 0;
1714   sctx.nshift_max     = 0;
1715   sctx.shift_lo       = 0;
1716   sctx.shift_hi       = 0;
1717   sctx.shift_fraction = 0;
1718 
1719   /* if both shift schemes are chosen by user, only use info->shiftpd */
1720   if (info->shifttype==MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1721     sctx.shift_top = 0;
1722     for (i=0; i<n; i++) {
1723       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1724       rs    = 0.0;
1725       ajtmp = aj + ai[i];
1726       rtmp1 = aa + ai[i];
1727       nz = ai[i+1] - ai[i];
1728       for (j=0; j<nz; j++){
1729         if (*ajtmp != i){
1730           rs += PetscAbsScalar(*rtmp1++);
1731         } else {
1732           rs -= PetscRealPart(*rtmp1++);
1733         }
1734         ajtmp++;
1735       }
1736       if (rs>sctx.shift_top) sctx.shift_top = rs;
1737     }
1738     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1739     sctx.shift_top *= 1.1;
1740     sctx.nshift_max = 5;
1741     sctx.shift_lo   = 0.;
1742     sctx.shift_hi   = 1.;
1743   }
1744   sctx.shift_amount = 0;
1745   sctx.nshift       = 0;
1746 
1747   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1748   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1749   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1750   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1751   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1752   ics   = ic ;
1753   rtmp22 = rtmp11 + n;
1754   rtmp33 = rtmp22 + n;
1755 
1756   node_max = a->inode.node_count;
1757   ns       = a->inode.size;
1758   if (!ns){
1759     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1760   }
1761 
1762   /* If max inode size > 3, split it into two inodes.*/
1763   /* also map the inode sizes according to the ordering */
1764   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1765   for (i=0,j=0; i<node_max; ++i,++j){
1766     if (ns[i]>3) {
1767       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1768       ++j;
1769       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1770     } else {
1771       tmp_vec1[j] = ns[i];
1772     }
1773   }
1774   /* Use the correct node_max */
1775   node_max = j;
1776 
1777   /* Now reorder the inode info based on mat re-ordering info */
1778   /* First create a row -> inode_size_array_index map */
1779   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1780   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1781   for (i=0,row=0; i<node_max; i++) {
1782     nodesz = tmp_vec1[i];
1783     for (j=0; j<nodesz; j++,row++) {
1784       nsmap[row] = i;
1785     }
1786   }
1787   /* Using nsmap, create a reordered ns structure */
1788   for (i=0,j=0; i< node_max; i++) {
1789     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1790     tmp_vec2[i]  = nodesz;
1791     j           += nodesz;
1792   }
1793   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1794   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1795   /* Now use the correct ns */
1796   ns = tmp_vec2;
1797 
1798   do {
1799     sctx.useshift = PETSC_FALSE;
1800     /* Now loop over each block-row, and do the factorization */
1801     for (i=0,row=0; i<node_max; i++) {
1802       nodesz = ns[i];
1803       nz     = bi[row+1] - bi[row];
1804       bjtmp  = bj + bi[row];
1805 
1806       switch (nodesz){
1807       case 1:
1808         for  (j=0; j<nz; j++){
1809           idx        = bjtmp[j];
1810           rtmp11[idx] = 0.0;
1811         }
1812 
1813         /* load in initial (unfactored row) */
1814         idx    = r[row];
1815         nz_tmp = ai[idx+1] - ai[idx];
1816         ajtmp  = aj + ai[idx];
1817         v1     = aa + ai[idx];
1818 
1819         for (j=0; j<nz_tmp; j++) {
1820           idx        = ics[ajtmp[j]];
1821           rtmp11[idx] = v1[j];
1822         }
1823         rtmp11[ics[r[row]]] += sctx.shift_amount;
1824 
1825         prow = *bjtmp++ ;
1826         while (prow < row) {
1827           pc1 = rtmp11 + prow;
1828           if (*pc1 != 0.0){
1829             pv   = ba + bd[prow];
1830             pj   = nbj + bd[prow];
1831             mul1 = *pc1 * *pv++;
1832             *pc1 = mul1;
1833             nz_tmp = bi[prow+1] - bd[prow] - 1;
1834             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1835             for (j=0; j<nz_tmp; j++) {
1836               tmp = pv[j];
1837               idx = pj[j];
1838               rtmp11[idx] -= mul1 * tmp;
1839             }
1840           }
1841           prow = *bjtmp++ ;
1842         }
1843         pj  = bj + bi[row];
1844         pc1 = ba + bi[row];
1845 
1846         sctx.pv    = rtmp11[row];
1847         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1848         rs         = 0.0;
1849         for (j=0; j<nz; j++) {
1850           idx    = pj[j];
1851           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1852           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1853         }
1854         sctx.rs  = rs;
1855         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1856         if (newshift == 1) goto endofwhile;
1857         break;
1858 
1859       case 2:
1860         for (j=0; j<nz; j++) {
1861           idx        = bjtmp[j];
1862           rtmp11[idx] = 0.0;
1863           rtmp22[idx] = 0.0;
1864         }
1865 
1866         /* load in initial (unfactored row) */
1867         idx    = r[row];
1868         nz_tmp = ai[idx+1] - ai[idx];
1869         ajtmp  = aj + ai[idx];
1870         v1     = aa + ai[idx];
1871         v2     = aa + ai[idx+1];
1872         for (j=0; j<nz_tmp; j++) {
1873           idx        = ics[ajtmp[j]];
1874           rtmp11[idx] = v1[j];
1875           rtmp22[idx] = v2[j];
1876         }
1877         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1878         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1879 
1880         prow = *bjtmp++ ;
1881         while (prow < row) {
1882           pc1 = rtmp11 + prow;
1883           pc2 = rtmp22 + prow;
1884           if (*pc1 != 0.0 || *pc2 != 0.0){
1885             pv   = ba + bd[prow];
1886             pj   = nbj + bd[prow];
1887             mul1 = *pc1 * *pv;
1888             mul2 = *pc2 * *pv;
1889             ++pv;
1890             *pc1 = mul1;
1891             *pc2 = mul2;
1892 
1893             nz_tmp = bi[prow+1] - bd[prow] - 1;
1894             for (j=0; j<nz_tmp; j++) {
1895               tmp = pv[j];
1896               idx = pj[j];
1897               rtmp11[idx] -= mul1 * tmp;
1898               rtmp22[idx] -= mul2 * tmp;
1899             }
1900             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1901           }
1902           prow = *bjtmp++ ;
1903         }
1904 
1905         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1906         pc1 = rtmp11 + prow;
1907         pc2 = rtmp22 + prow;
1908 
1909         sctx.pv = *pc1;
1910         pj      = bj + bi[prow];
1911         rs      = 0.0;
1912         for (j=0; j<nz; j++){
1913           idx = pj[j];
1914           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1915         }
1916         sctx.rs = rs;
1917         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1918         if (newshift == 1) goto endofwhile;
1919 
1920         if (*pc2 != 0.0){
1921           pj     = nbj + bd[prow];
1922           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1923           *pc2   = mul2;
1924           nz_tmp = bi[prow+1] - bd[prow] - 1;
1925           for (j=0; j<nz_tmp; j++) {
1926             idx = pj[j] ;
1927             tmp = rtmp11[idx];
1928             rtmp22[idx] -= mul2 * tmp;
1929           }
1930           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1931         }
1932 
1933         pj  = bj + bi[row];
1934         pc1 = ba + bi[row];
1935         pc2 = ba + bi[row+1];
1936 
1937         sctx.pv = rtmp22[row+1];
1938         rs = 0.0;
1939         rtmp11[row]   = 1.0/rtmp11[row];
1940         rtmp22[row+1] = 1.0/rtmp22[row+1];
1941         /* copy row entries from dense representation to sparse */
1942         for (j=0; j<nz; j++) {
1943           idx    = pj[j];
1944           pc1[j] = rtmp11[idx];
1945           pc2[j] = rtmp22[idx];
1946           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1947         }
1948         sctx.rs = rs;
1949         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1950         if (newshift == 1) goto endofwhile;
1951         break;
1952 
1953       case 3:
1954         for  (j=0; j<nz; j++) {
1955           idx        = bjtmp[j];
1956           rtmp11[idx] = 0.0;
1957           rtmp22[idx] = 0.0;
1958           rtmp33[idx] = 0.0;
1959         }
1960         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1961         idx    = r[row];
1962         nz_tmp = ai[idx+1] - ai[idx];
1963         ajtmp = aj + ai[idx];
1964         v1    = aa + ai[idx];
1965         v2    = aa + ai[idx+1];
1966         v3    = aa + ai[idx+2];
1967         for (j=0; j<nz_tmp; j++) {
1968           idx        = ics[ajtmp[j]];
1969           rtmp11[idx] = v1[j];
1970           rtmp22[idx] = v2[j];
1971           rtmp33[idx] = v3[j];
1972         }
1973         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1974         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1975         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1976 
1977         /* loop over all pivot row blocks above this row block */
1978         prow = *bjtmp++ ;
1979         while (prow < row) {
1980           pc1 = rtmp11 + prow;
1981           pc2 = rtmp22 + prow;
1982           pc3 = rtmp33 + prow;
1983           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1984             pv   = ba  + bd[prow];
1985             pj   = nbj + bd[prow];
1986             mul1 = *pc1 * *pv;
1987             mul2 = *pc2 * *pv;
1988             mul3 = *pc3 * *pv;
1989             ++pv;
1990             *pc1 = mul1;
1991             *pc2 = mul2;
1992             *pc3 = mul3;
1993 
1994             nz_tmp = bi[prow+1] - bd[prow] - 1;
1995             /* update this row based on pivot row */
1996             for (j=0; j<nz_tmp; j++) {
1997               tmp = pv[j];
1998               idx = pj[j];
1999               rtmp11[idx] -= mul1 * tmp;
2000               rtmp22[idx] -= mul2 * tmp;
2001               rtmp33[idx] -= mul3 * tmp;
2002             }
2003             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
2004           }
2005           prow = *bjtmp++ ;
2006         }
2007 
2008         /* Now take care of diagonal 3x3 block in this set of rows */
2009         /* note: prow = row here */
2010         pc1 = rtmp11 + prow;
2011         pc2 = rtmp22 + prow;
2012         pc3 = rtmp33 + prow;
2013 
2014         sctx.pv = *pc1;
2015         pj      = bj + bi[prow];
2016         rs      = 0.0;
2017         for (j=0; j<nz; j++){
2018           idx = pj[j];
2019           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2020         }
2021         sctx.rs = rs;
2022         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2023         if (newshift == 1) goto endofwhile;
2024 
2025         if (*pc2 != 0.0 || *pc3 != 0.0){
2026           mul2 = (*pc2)/(*pc1);
2027           mul3 = (*pc3)/(*pc1);
2028           *pc2 = mul2;
2029           *pc3 = mul3;
2030           nz_tmp = bi[prow+1] - bd[prow] - 1;
2031           pj     = nbj + bd[prow];
2032           for (j=0; j<nz_tmp; j++) {
2033             idx = pj[j] ;
2034             tmp = rtmp11[idx];
2035             rtmp22[idx] -= mul2 * tmp;
2036             rtmp33[idx] -= mul3 * tmp;
2037           }
2038           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2039         }
2040         ++prow;
2041 
2042         pc2 = rtmp22 + prow;
2043         pc3 = rtmp33 + prow;
2044         sctx.pv = *pc2;
2045         pj      = bj + bi[prow];
2046         rs      = 0.0;
2047         for (j=0; j<nz; j++){
2048           idx = pj[j];
2049           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2050         }
2051         sctx.rs = rs;
2052         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
2053         if (newshift == 1) goto endofwhile;
2054 
2055         if (*pc3 != 0.0){
2056           mul3   = (*pc3)/(*pc2);
2057           *pc3   = mul3;
2058           pj     = nbj + bd[prow];
2059           nz_tmp = bi[prow+1] - bd[prow] - 1;
2060           for (j=0; j<nz_tmp; j++) {
2061             idx = pj[j] ;
2062             tmp = rtmp22[idx];
2063             rtmp33[idx] -= mul3 * tmp;
2064           }
2065           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2066         }
2067 
2068         pj  = bj + bi[row];
2069         pc1 = ba + bi[row];
2070         pc2 = ba + bi[row+1];
2071         pc3 = ba + bi[row+2];
2072 
2073         sctx.pv = rtmp33[row+2];
2074         rs = 0.0;
2075         rtmp11[row]   = 1.0/rtmp11[row];
2076         rtmp22[row+1] = 1.0/rtmp22[row+1];
2077         rtmp33[row+2] = 1.0/rtmp33[row+2];
2078         /* copy row entries from dense representation to sparse */
2079         for (j=0; j<nz; j++) {
2080           idx    = pj[j];
2081           pc1[j] = rtmp11[idx];
2082           pc2[j] = rtmp22[idx];
2083           pc3[j] = rtmp33[idx];
2084           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2085         }
2086 
2087         sctx.rs = rs;
2088         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
2089         if (newshift == 1) goto endofwhile;
2090         break;
2091 
2092       default:
2093         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
2094       }
2095       row += nodesz;                 /* Update the row */
2096     }
2097     endofwhile:;
2098   } while (sctx.useshift);
2099   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2100   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2101   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2102   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2103   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2104   (B)->ops->solve           = MatSolve_SeqAIJ_Inode_inplace;
2105   /* do not set solve add, since MatSolve_Inode + Add is faster */
2106   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2107   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2108   C->assembled   = PETSC_TRUE;
2109   C->preallocated = PETSC_TRUE;
2110   if (sctx.nshift) {
2111     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) {
2112       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2113     } else if (info->shifttype == MAT_SHIFT_NONZERO) {
2114       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2115     }
2116   }
2117   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2118   PetscFunctionReturn(0);
2119 }
2120 
2121 
2122 /* ----------------------------------------------------------- */
2123 #undef __FUNCT__
2124 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2125 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2126 {
2127   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2128   IS                iscol = a->col,isrow = a->row;
2129   PetscErrorCode    ierr;
2130   const PetscInt    *r,*c,*rout,*cout;
2131   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2132   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2133   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2134   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2135   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2136   const PetscScalar *b;
2137 
2138   PetscFunctionBegin;
2139   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
2140   node_max = a->inode.node_count;
2141   ns       = a->inode.size;     /* Node Size array */
2142 
2143   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2144   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2145   tmp  = a->solve_work;
2146 
2147   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2148   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2149 
2150   /* forward solve the lower triangular */
2151   tmps = tmp ;
2152   aa   = a_a ;
2153   aj   = a_j ;
2154   ad   = a->diag;
2155 
2156   for (i = 0,row = 0; i< node_max; ++i){
2157     nsz = ns[i];
2158     aii = ai[row];
2159     v1  = aa + aii;
2160     vi  = aj + aii;
2161     nz  = ai[row+1]- ai[row];
2162 
2163     if (i < node_max-1) {
2164       /* Prefetch the indices for the next block */
2165       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2166       /* Prefetch the data for the next block */
2167       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2168     }
2169 
2170     switch (nsz){               /* Each loop in 'case' is unrolled */
2171     case 1 :
2172       sum1 = b[r[row]];
2173       for(j=0; j<nz-1; j+=2){
2174         i0   = vi[j];
2175         i1   = vi[j+1];
2176         tmp0 = tmps[i0];
2177         tmp1 = tmps[i1];
2178         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2179       }
2180       if(j == nz-1){
2181         tmp0 = tmps[vi[j]];
2182         sum1 -= v1[j]*tmp0;
2183       }
2184       tmp[row++]=sum1;
2185       break;
2186     case 2:
2187       sum1 = b[r[row]];
2188       sum2 = b[r[row+1]];
2189       v2   = aa + ai[row+1];
2190 
2191       for(j=0; j<nz-1; j+=2){
2192         i0   = vi[j];
2193         i1   = vi[j+1];
2194         tmp0 = tmps[i0];
2195         tmp1 = tmps[i1];
2196         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2197         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2198       }
2199       if(j == nz-1){
2200         tmp0 = tmps[vi[j]];
2201         sum1 -= v1[j] *tmp0;
2202         sum2 -= v2[j] *tmp0;
2203       }
2204       sum2 -= v2[nz] * sum1;
2205       tmp[row ++]=sum1;
2206       tmp[row ++]=sum2;
2207       break;
2208     case 3:
2209       sum1 = b[r[row]];
2210       sum2 = b[r[row+1]];
2211       sum3 = b[r[row+2]];
2212       v2   = aa + ai[row+1];
2213       v3   = aa + ai[row+2];
2214 
2215       for (j=0; j<nz-1; j+=2){
2216         i0   = vi[j];
2217         i1   = vi[j+1];
2218         tmp0 = tmps[i0];
2219         tmp1 = tmps[i1];
2220         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2221         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2222         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2223       }
2224       if (j == nz-1){
2225         tmp0 = tmps[vi[j]];
2226         sum1 -= v1[j] *tmp0;
2227         sum2 -= v2[j] *tmp0;
2228         sum3 -= v3[j] *tmp0;
2229       }
2230       sum2 -= v2[nz] * sum1;
2231       sum3 -= v3[nz] * sum1;
2232       sum3 -= v3[nz+1] * sum2;
2233       tmp[row ++]=sum1;
2234       tmp[row ++]=sum2;
2235       tmp[row ++]=sum3;
2236       break;
2237 
2238     case 4:
2239       sum1 = b[r[row]];
2240       sum2 = b[r[row+1]];
2241       sum3 = b[r[row+2]];
2242       sum4 = b[r[row+3]];
2243       v2   = aa + ai[row+1];
2244       v3   = aa + ai[row+2];
2245       v4   = aa + ai[row+3];
2246 
2247       for (j=0; j<nz-1; j+=2){
2248         i0   = vi[j];
2249         i1   = vi[j+1];
2250         tmp0 = tmps[i0];
2251         tmp1 = tmps[i1];
2252         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2253         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2254         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2255         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2256       }
2257       if (j == nz-1){
2258         tmp0 = tmps[vi[j]];
2259         sum1 -= v1[j] *tmp0;
2260         sum2 -= v2[j] *tmp0;
2261         sum3 -= v3[j] *tmp0;
2262         sum4 -= v4[j] *tmp0;
2263       }
2264       sum2 -= v2[nz] * sum1;
2265       sum3 -= v3[nz] * sum1;
2266       sum4 -= v4[nz] * sum1;
2267       sum3 -= v3[nz+1] * sum2;
2268       sum4 -= v4[nz+1] * sum2;
2269       sum4 -= v4[nz+2] * sum3;
2270 
2271       tmp[row ++]=sum1;
2272       tmp[row ++]=sum2;
2273       tmp[row ++]=sum3;
2274       tmp[row ++]=sum4;
2275       break;
2276     case 5:
2277       sum1 = b[r[row]];
2278       sum2 = b[r[row+1]];
2279       sum3 = b[r[row+2]];
2280       sum4 = b[r[row+3]];
2281       sum5 = b[r[row+4]];
2282       v2   = aa + ai[row+1];
2283       v3   = aa + ai[row+2];
2284       v4   = aa + ai[row+3];
2285       v5   = aa + ai[row+4];
2286 
2287       for (j=0; j<nz-1; j+=2){
2288         i0   = vi[j];
2289         i1   = vi[j+1];
2290         tmp0 = tmps[i0];
2291         tmp1 = tmps[i1];
2292         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2293         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2294         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2295         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2296         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2297       }
2298       if (j == nz-1){
2299         tmp0 = tmps[vi[j]];
2300         sum1 -= v1[j] *tmp0;
2301         sum2 -= v2[j] *tmp0;
2302         sum3 -= v3[j] *tmp0;
2303         sum4 -= v4[j] *tmp0;
2304         sum5 -= v5[j] *tmp0;
2305       }
2306 
2307       sum2 -= v2[nz] * sum1;
2308       sum3 -= v3[nz] * sum1;
2309       sum4 -= v4[nz] * sum1;
2310       sum5 -= v5[nz] * sum1;
2311       sum3 -= v3[nz+1] * sum2;
2312       sum4 -= v4[nz+1] * sum2;
2313       sum5 -= v5[nz+1] * sum2;
2314       sum4 -= v4[nz+2] * sum3;
2315       sum5 -= v5[nz+2] * sum3;
2316       sum5 -= v5[nz+3] * sum4;
2317 
2318       tmp[row ++]=sum1;
2319       tmp[row ++]=sum2;
2320       tmp[row ++]=sum3;
2321       tmp[row ++]=sum4;
2322       tmp[row ++]=sum5;
2323       break;
2324     default:
2325       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2326     }
2327   }
2328   /* backward solve the upper triangular */
2329   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2330     nsz = ns[i];
2331     aii = ad[row+1] + 1;
2332     v1  = aa + aii;
2333     vi  = aj + aii;
2334     nz  = ad[row]- ad[row+1] - 1;
2335 
2336     if (i > 0) {
2337       /* Prefetch the indices for the next block */
2338       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2339       /* Prefetch the data for the next block */
2340       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2341     }
2342 
2343     switch (nsz){               /* Each loop in 'case' is unrolled */
2344     case 1 :
2345       sum1 = tmp[row];
2346 
2347       for(j=0 ; j<nz-1; j+=2){
2348         i0   = vi[j];
2349         i1   = vi[j+1];
2350         tmp0 = tmps[i0];
2351         tmp1 = tmps[i1];
2352         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2353       }
2354       if (j == nz-1){
2355         tmp0  = tmps[vi[j]];
2356         sum1 -= v1[j]*tmp0;
2357       }
2358       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2359       break;
2360     case 2 :
2361       sum1 = tmp[row];
2362       sum2 = tmp[row-1];
2363       v2   = aa + ad[row] + 1;
2364       for (j=0 ; j<nz-1; j+=2){
2365         i0   = vi[j];
2366         i1   = vi[j+1];
2367         tmp0 = tmps[i0];
2368         tmp1 = tmps[i1];
2369         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2370         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2371       }
2372       if (j == nz-1){
2373         tmp0  = tmps[vi[j]];
2374         sum1 -= v1[j]* tmp0;
2375         sum2 -= v2[j+1]* tmp0;
2376       }
2377 
2378       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2379       sum2   -= v2[0] * tmp0;
2380       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2381       break;
2382     case 3 :
2383       sum1 = tmp[row];
2384       sum2 = tmp[row -1];
2385       sum3 = tmp[row -2];
2386       v2   = aa + ad[row] + 1;
2387       v3   = aa + ad[row -1] + 1;
2388       for (j=0 ; j<nz-1; j+=2){
2389         i0   = vi[j];
2390         i1   = vi[j+1];
2391         tmp0 = tmps[i0];
2392         tmp1 = tmps[i1];
2393         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2394         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2395         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2396       }
2397       if (j== nz-1){
2398         tmp0  = tmps[vi[j]];
2399         sum1 -= v1[j] * tmp0;
2400         sum2 -= v2[j+1] * tmp0;
2401         sum3 -= v3[j+2] * tmp0;
2402       }
2403       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2404       sum2   -= v2[0]* tmp0;
2405       sum3   -= v3[1] * tmp0;
2406       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2407       sum3   -= v3[0]* tmp0;
2408       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2409 
2410       break;
2411     case 4 :
2412       sum1 = tmp[row];
2413       sum2 = tmp[row -1];
2414       sum3 = tmp[row -2];
2415       sum4 = tmp[row -3];
2416       v2   = aa + ad[row]+1;
2417       v3   = aa + ad[row -1]+1;
2418       v4   = aa + ad[row -2]+1;
2419 
2420       for (j=0 ; j<nz-1; j+=2){
2421         i0   = vi[j];
2422         i1   = vi[j+1];
2423         tmp0 = tmps[i0];
2424         tmp1 = tmps[i1];
2425         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2426         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2427         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2428         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2429       }
2430       if (j== nz-1){
2431         tmp0  = tmps[vi[j]];
2432         sum1 -= v1[j] * tmp0;
2433         sum2 -= v2[j+1] * tmp0;
2434         sum3 -= v3[j+2] * tmp0;
2435         sum4 -= v4[j+3] * tmp0;
2436       }
2437 
2438       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2439       sum2   -= v2[0] * tmp0;
2440       sum3   -= v3[1] * tmp0;
2441       sum4   -= v4[2] * tmp0;
2442       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2443       sum3   -= v3[0] * tmp0;
2444       sum4   -= v4[1] * tmp0;
2445       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2446       sum4   -= v4[0] * tmp0;
2447       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2448       break;
2449     case 5 :
2450       sum1 = tmp[row];
2451       sum2 = tmp[row -1];
2452       sum3 = tmp[row -2];
2453       sum4 = tmp[row -3];
2454       sum5 = tmp[row -4];
2455       v2   = aa + ad[row]+1;
2456       v3   = aa + ad[row -1]+1;
2457       v4   = aa + ad[row -2]+1;
2458       v5   = aa + ad[row -3]+1;
2459       for (j=0 ; j<nz-1; j+=2){
2460         i0   = vi[j];
2461         i1   = vi[j+1];
2462         tmp0 = tmps[i0];
2463         tmp1 = tmps[i1];
2464         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2465         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2466         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2467         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2468         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2469       }
2470       if (j==nz-1){
2471         tmp0  = tmps[vi[j]];
2472         sum1 -= v1[j] * tmp0;
2473         sum2 -= v2[j+1] * tmp0;
2474         sum3 -= v3[j+2] * tmp0;
2475         sum4 -= v4[j+3] * tmp0;
2476         sum5 -= v5[j+4] * tmp0;
2477       }
2478 
2479       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2480       sum2   -= v2[0] * tmp0;
2481       sum3   -= v3[1] * tmp0;
2482       sum4   -= v4[2] * tmp0;
2483       sum5   -= v5[3] * tmp0;
2484       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2485       sum3   -= v3[0] * tmp0;
2486       sum4   -= v4[1] * tmp0;
2487       sum5   -= v5[2] * tmp0;
2488       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2489       sum4   -= v4[0] * tmp0;
2490       sum5   -= v5[1] * tmp0;
2491       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2492       sum5   -= v5[0] * tmp0;
2493       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2494       break;
2495     default:
2496       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2497     }
2498   }
2499   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2500   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2501   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2502   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2503   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2504   PetscFunctionReturn(0);
2505 }
2506 
2507 
2508 /*
2509      Makes a longer coloring[] array and calls the usual code with that
2510 */
2511 #undef __FUNCT__
2512 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2513 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2514 {
2515   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2516   PetscErrorCode  ierr;
2517   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2518   PetscInt        *colorused,i;
2519   ISColoringValue *newcolor;
2520 
2521   PetscFunctionBegin;
2522   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2523   /* loop over inodes, marking a color for each column*/
2524   row = 0;
2525   for (i=0; i<m; i++){
2526     for (j=0; j<ns[i]; j++) {
2527       newcolor[row++] = coloring[i] + j*ncolors;
2528     }
2529   }
2530 
2531   /* eliminate unneeded colors */
2532   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2533   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2534   for (i=0; i<n; i++) {
2535     colorused[newcolor[i]] = 1;
2536   }
2537 
2538   for (i=1; i<5*ncolors; i++) {
2539     colorused[i] += colorused[i-1];
2540   }
2541   ncolors = colorused[5*ncolors-1];
2542   for (i=0; i<n; i++) {
2543     newcolor[i] = colorused[newcolor[i]]-1;
2544   }
2545   ierr = PetscFree(colorused);CHKERRQ(ierr);
2546   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2547   ierr = PetscFree(coloring);CHKERRQ(ierr);
2548   PetscFunctionReturn(0);
2549 }
2550 
2551 #include "../src/mat/blockinvert.h"
2552 
2553 #undef __FUNCT__
2554 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2555 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2556 {
2557   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2558   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2559   MatScalar          *ibdiag,*bdiag,work[25];
2560   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2561   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2562   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2563   PetscErrorCode     ierr;
2564   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2565   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2566 
2567   PetscFunctionBegin;
2568   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2569   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2570   if (its > 1) {
2571     /* switch to non-inode version */
2572     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2573     PetscFunctionReturn(0);
2574   }
2575 
2576   if (!a->inode.ibdiagvalid) {
2577     if (!a->inode.ibdiag) {
2578       /* calculate space needed for diagonal blocks */
2579       for (i=0; i<m; i++) {
2580 	cnt += sizes[i]*sizes[i];
2581       }
2582       a->inode.bdiagsize = cnt;
2583       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2584     }
2585 
2586     /* copy over the diagonal blocks and invert them */
2587     ibdiag = a->inode.ibdiag;
2588     bdiag  = a->inode.bdiag;
2589     cnt = 0;
2590     for (i=0, row = 0; i<m; i++) {
2591       for (j=0; j<sizes[i]; j++) {
2592         for (k=0; k<sizes[i]; k++) {
2593           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2594         }
2595       }
2596       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2597 
2598       switch(sizes[i]) {
2599         case 1:
2600           /* Create matrix data structure */
2601           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2602           ibdiag[cnt] = 1.0/ibdiag[cnt];
2603           break;
2604         case 2:
2605           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2606           break;
2607         case 3:
2608           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2609           break;
2610         case 4:
2611           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2612           break;
2613         case 5:
2614           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2615           break;
2616        default:
2617 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2618       }
2619       cnt += sizes[i]*sizes[i];
2620       row += sizes[i];
2621     }
2622     a->inode.ibdiagvalid = PETSC_TRUE;
2623   }
2624   ibdiag = a->inode.ibdiag;
2625   bdiag  = a->inode.bdiag;
2626 
2627 
2628   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2629   if (flag & SOR_ZERO_INITIAL_GUESS) {
2630     PetscScalar *b;
2631     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2632     if (xx != bb) {
2633       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2634     } else {
2635       b = x;
2636     }
2637     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2638 
2639       for (i=0, row=0; i<m; i++) {
2640         sz  = diag[row] - ii[row];
2641         v1  = a->a + ii[row];
2642         idx = a->j + ii[row];
2643 
2644         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2645         switch (sizes[i]){
2646           case 1:
2647 
2648             sum1  = b[row];
2649             for(n = 0; n<sz-1; n+=2) {
2650               i1   = idx[0];
2651               i2   = idx[1];
2652               idx += 2;
2653               tmp0 = x[i1];
2654               tmp1 = x[i2];
2655               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2656             }
2657 
2658             if (n == sz-1){
2659               tmp0  = x[*idx];
2660               sum1 -= *v1 * tmp0;
2661             }
2662             x[row++] = sum1*(*ibdiag++);
2663             break;
2664           case 2:
2665             v2    = a->a + ii[row+1];
2666             sum1  = b[row];
2667             sum2  = b[row+1];
2668             for(n = 0; n<sz-1; n+=2) {
2669               i1   = idx[0];
2670               i2   = idx[1];
2671               idx += 2;
2672               tmp0 = x[i1];
2673               tmp1 = x[i2];
2674               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2675               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2676             }
2677 
2678             if (n == sz-1){
2679               tmp0  = x[*idx];
2680               sum1 -= v1[0] * tmp0;
2681               sum2 -= v2[0] * tmp0;
2682             }
2683             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2684             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2685             ibdiag  += 4;
2686             break;
2687           case 3:
2688             v2    = a->a + ii[row+1];
2689             v3    = a->a + ii[row+2];
2690             sum1  = b[row];
2691             sum2  = b[row+1];
2692             sum3  = b[row+2];
2693             for(n = 0; n<sz-1; n+=2) {
2694               i1   = idx[0];
2695               i2   = idx[1];
2696               idx += 2;
2697               tmp0 = x[i1];
2698               tmp1 = x[i2];
2699               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2700               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2701               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2702             }
2703 
2704             if (n == sz-1){
2705               tmp0  = x[*idx];
2706               sum1 -= v1[0] * tmp0;
2707               sum2 -= v2[0] * tmp0;
2708               sum3 -= v3[0] * tmp0;
2709             }
2710             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2711             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2712             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2713             ibdiag  += 9;
2714             break;
2715           case 4:
2716             v2    = a->a + ii[row+1];
2717             v3    = a->a + ii[row+2];
2718             v4    = a->a + ii[row+3];
2719             sum1  = b[row];
2720             sum2  = b[row+1];
2721             sum3  = b[row+2];
2722             sum4  = b[row+3];
2723             for(n = 0; n<sz-1; n+=2) {
2724               i1   = idx[0];
2725               i2   = idx[1];
2726               idx += 2;
2727               tmp0 = x[i1];
2728               tmp1 = x[i2];
2729               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2730               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2731               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2732               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2733             }
2734 
2735             if (n == sz-1){
2736               tmp0  = x[*idx];
2737               sum1 -= v1[0] * tmp0;
2738               sum2 -= v2[0] * tmp0;
2739               sum3 -= v3[0] * tmp0;
2740               sum4 -= v4[0] * tmp0;
2741             }
2742             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2743             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2744             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2745             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2746             ibdiag  += 16;
2747             break;
2748           case 5:
2749             v2    = a->a + ii[row+1];
2750             v3    = a->a + ii[row+2];
2751             v4    = a->a + ii[row+3];
2752             v5    = a->a + ii[row+4];
2753             sum1  = b[row];
2754             sum2  = b[row+1];
2755             sum3  = b[row+2];
2756             sum4  = b[row+3];
2757             sum5  = b[row+4];
2758             for(n = 0; n<sz-1; n+=2) {
2759               i1   = idx[0];
2760               i2   = idx[1];
2761               idx += 2;
2762               tmp0 = x[i1];
2763               tmp1 = x[i2];
2764               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2765               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2766               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2767               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2768               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2769             }
2770 
2771             if (n == sz-1){
2772               tmp0  = x[*idx];
2773               sum1 -= v1[0] * tmp0;
2774               sum2 -= v2[0] * tmp0;
2775               sum3 -= v3[0] * tmp0;
2776               sum4 -= v4[0] * tmp0;
2777               sum5 -= v5[0] * tmp0;
2778             }
2779             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2780             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2781             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2782             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2783             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2784             ibdiag  += 25;
2785             break;
2786 	  default:
2787    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2788         }
2789       }
2790 
2791       xb = x;
2792       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2793     } else xb = b;
2794     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2795         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
2796       cnt = 0;
2797       for (i=0, row=0; i<m; i++) {
2798 
2799         switch (sizes[i]){
2800           case 1:
2801             x[row++] *= bdiag[cnt++];
2802             break;
2803           case 2:
2804             x1   = x[row]; x2 = x[row+1];
2805             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2806             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2807             x[row++] = tmp1;
2808             x[row++] = tmp2;
2809             cnt += 4;
2810             break;
2811           case 3:
2812             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2813             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2814             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2815             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2816             x[row++] = tmp1;
2817             x[row++] = tmp2;
2818             x[row++] = tmp3;
2819             cnt += 9;
2820             break;
2821           case 4:
2822             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2823             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2824             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2825             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2826             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2827             x[row++] = tmp1;
2828             x[row++] = tmp2;
2829             x[row++] = tmp3;
2830             x[row++] = tmp4;
2831             cnt += 16;
2832             break;
2833           case 5:
2834             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2835             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2836             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2837             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2838             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2839             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2840             x[row++] = tmp1;
2841             x[row++] = tmp2;
2842             x[row++] = tmp3;
2843             x[row++] = tmp4;
2844             x[row++] = tmp5;
2845             cnt += 25;
2846             break;
2847 	  default:
2848    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2849         }
2850       }
2851       ierr = PetscLogFlops(m);CHKERRQ(ierr);
2852     }
2853     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
2854 
2855       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2856       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2857         ibdiag -= sizes[i]*sizes[i];
2858         sz      = ii[row+1] - diag[row] - 1;
2859         v1      = a->a + diag[row] + 1;
2860         idx     = a->j + diag[row] + 1;
2861 
2862         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2863         switch (sizes[i]){
2864           case 1:
2865 
2866             sum1  = xb[row];
2867             for(n = 0; n<sz-1; n+=2) {
2868               i1   = idx[0];
2869               i2   = idx[1];
2870               idx += 2;
2871               tmp0 = x[i1];
2872               tmp1 = x[i2];
2873               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2874             }
2875 
2876             if (n == sz-1){
2877               tmp0  = x[*idx];
2878               sum1 -= *v1*tmp0;
2879             }
2880             x[row--] = sum1*(*ibdiag);
2881             break;
2882 
2883           case 2:
2884 
2885             sum1  = xb[row];
2886             sum2  = xb[row-1];
2887             /* note that sum1 is associated with the second of the two rows */
2888             v2    = a->a + diag[row-1] + 2;
2889             for(n = 0; n<sz-1; n+=2) {
2890               i1   = idx[0];
2891               i2   = idx[1];
2892               idx += 2;
2893               tmp0 = x[i1];
2894               tmp1 = x[i2];
2895               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2896               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2897             }
2898 
2899             if (n == sz-1){
2900               tmp0  = x[*idx];
2901               sum1 -= *v1*tmp0;
2902               sum2 -= *v2*tmp0;
2903             }
2904             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
2905             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
2906             break;
2907           case 3:
2908 
2909             sum1  = xb[row];
2910             sum2  = xb[row-1];
2911             sum3  = xb[row-2];
2912             v2    = a->a + diag[row-1] + 2;
2913             v3    = a->a + diag[row-2] + 3;
2914             for(n = 0; n<sz-1; n+=2) {
2915               i1   = idx[0];
2916               i2   = idx[1];
2917               idx += 2;
2918               tmp0 = x[i1];
2919               tmp1 = x[i2];
2920               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2921               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2922               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2923             }
2924 
2925             if (n == sz-1){
2926               tmp0  = x[*idx];
2927               sum1 -= *v1*tmp0;
2928               sum2 -= *v2*tmp0;
2929               sum3 -= *v3*tmp0;
2930             }
2931             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2932             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2933             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2934             break;
2935           case 4:
2936 
2937             sum1  = xb[row];
2938             sum2  = xb[row-1];
2939             sum3  = xb[row-2];
2940             sum4  = xb[row-3];
2941             v2    = a->a + diag[row-1] + 2;
2942             v3    = a->a + diag[row-2] + 3;
2943             v4    = a->a + diag[row-3] + 4;
2944             for(n = 0; n<sz-1; n+=2) {
2945               i1   = idx[0];
2946               i2   = idx[1];
2947               idx += 2;
2948               tmp0 = x[i1];
2949               tmp1 = x[i2];
2950               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2951               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2952               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2953               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2954             }
2955 
2956             if (n == sz-1){
2957               tmp0  = x[*idx];
2958               sum1 -= *v1*tmp0;
2959               sum2 -= *v2*tmp0;
2960               sum3 -= *v3*tmp0;
2961               sum4 -= *v4*tmp0;
2962             }
2963             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2964             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2965             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2966             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2967             break;
2968           case 5:
2969 
2970             sum1  = xb[row];
2971             sum2  = xb[row-1];
2972             sum3  = xb[row-2];
2973             sum4  = xb[row-3];
2974             sum5  = xb[row-4];
2975             v2    = a->a + diag[row-1] + 2;
2976             v3    = a->a + diag[row-2] + 3;
2977             v4    = a->a + diag[row-3] + 4;
2978             v5    = a->a + diag[row-4] + 5;
2979             for(n = 0; n<sz-1; n+=2) {
2980               i1   = idx[0];
2981               i2   = idx[1];
2982               idx += 2;
2983               tmp0 = x[i1];
2984               tmp1 = x[i2];
2985               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2986               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2987               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2988               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2989               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2990             }
2991 
2992             if (n == sz-1){
2993               tmp0  = x[*idx];
2994               sum1 -= *v1*tmp0;
2995               sum2 -= *v2*tmp0;
2996               sum3 -= *v3*tmp0;
2997               sum4 -= *v4*tmp0;
2998               sum5 -= *v5*tmp0;
2999             }
3000             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3001             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3002             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3003             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3004             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3005             break;
3006 	  default:
3007    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3008         }
3009       }
3010 
3011       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3012     }
3013     its--;
3014     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3015     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
3016   }
3017   if (flag & SOR_EISENSTAT) {
3018     const PetscScalar *b;
3019     MatScalar         *t = a->inode.ssor_work;
3020 
3021     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3022     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3023     /*
3024           Apply  (U + D)^-1  where D is now the block diagonal
3025     */
3026     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3027     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3028       ibdiag -= sizes[i]*sizes[i];
3029       sz      = ii[row+1] - diag[row] - 1;
3030       v1      = a->a + diag[row] + 1;
3031       idx     = a->j + diag[row] + 1;
3032       CHKMEMQ;
3033       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3034       switch (sizes[i]){
3035         case 1:
3036 
3037 	  sum1  = b[row];
3038 	  for(n = 0; n<sz-1; n+=2) {
3039 	    i1   = idx[0];
3040 	    i2   = idx[1];
3041 	    idx += 2;
3042 	    tmp0 = x[i1];
3043 	    tmp1 = x[i2];
3044 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3045 	  }
3046 
3047 	  if (n == sz-1){
3048 	    tmp0  = x[*idx];
3049 	    sum1 -= *v1*tmp0;
3050 	  }
3051 	  x[row] = sum1*(*ibdiag);row--;
3052 	  break;
3053 
3054         case 2:
3055 
3056 	  sum1  = b[row];
3057 	  sum2  = b[row-1];
3058 	  /* note that sum1 is associated with the second of the two rows */
3059 	  v2    = a->a + diag[row-1] + 2;
3060 	  for(n = 0; n<sz-1; n+=2) {
3061 	    i1   = idx[0];
3062 	    i2   = idx[1];
3063 	    idx += 2;
3064 	    tmp0 = x[i1];
3065 	    tmp1 = x[i2];
3066 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3067 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3068 	  }
3069 
3070 	  if (n == sz-1){
3071 	    tmp0  = x[*idx];
3072 	    sum1 -= *v1*tmp0;
3073 	    sum2 -= *v2*tmp0;
3074 	  }
3075 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3076 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3077           row -= 2;
3078 	  break;
3079         case 3:
3080 
3081 	  sum1  = b[row];
3082 	  sum2  = b[row-1];
3083 	  sum3  = b[row-2];
3084 	  v2    = a->a + diag[row-1] + 2;
3085 	  v3    = a->a + diag[row-2] + 3;
3086 	  for(n = 0; n<sz-1; n+=2) {
3087 	    i1   = idx[0];
3088 	    i2   = idx[1];
3089 	    idx += 2;
3090 	    tmp0 = x[i1];
3091 	    tmp1 = x[i2];
3092 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3093 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3094 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3095 	  }
3096 
3097 	  if (n == sz-1){
3098 	    tmp0  = x[*idx];
3099 	    sum1 -= *v1*tmp0;
3100 	    sum2 -= *v2*tmp0;
3101 	    sum3 -= *v3*tmp0;
3102 	  }
3103 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3104 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3105 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3106           row -= 3;
3107 	  break;
3108         case 4:
3109 
3110 	  sum1  = b[row];
3111 	  sum2  = b[row-1];
3112 	  sum3  = b[row-2];
3113 	  sum4  = b[row-3];
3114 	  v2    = a->a + diag[row-1] + 2;
3115 	  v3    = a->a + diag[row-2] + 3;
3116 	  v4    = a->a + diag[row-3] + 4;
3117 	  for(n = 0; n<sz-1; n+=2) {
3118 	    i1   = idx[0];
3119 	    i2   = idx[1];
3120 	    idx += 2;
3121 	    tmp0 = x[i1];
3122 	    tmp1 = x[i2];
3123 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3124 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3125 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3126 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3127 	  }
3128 
3129 	  if (n == sz-1){
3130 	    tmp0  = x[*idx];
3131 	    sum1 -= *v1*tmp0;
3132 	    sum2 -= *v2*tmp0;
3133 	    sum3 -= *v3*tmp0;
3134 	    sum4 -= *v4*tmp0;
3135 	  }
3136 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3137 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3138 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3139 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3140           row -= 4;
3141 	  break;
3142         case 5:
3143 
3144 	  sum1  = b[row];
3145 	  sum2  = b[row-1];
3146 	  sum3  = b[row-2];
3147 	  sum4  = b[row-3];
3148 	  sum5  = b[row-4];
3149 	  v2    = a->a + diag[row-1] + 2;
3150 	  v3    = a->a + diag[row-2] + 3;
3151 	  v4    = a->a + diag[row-3] + 4;
3152 	  v5    = a->a + diag[row-4] + 5;
3153 	  for(n = 0; n<sz-1; n+=2) {
3154 	    i1   = idx[0];
3155 	    i2   = idx[1];
3156 	    idx += 2;
3157 	    tmp0 = x[i1];
3158 	    tmp1 = x[i2];
3159 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3160 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3161 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3162 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3163 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3164 	  }
3165 
3166 	  if (n == sz-1){
3167 	    tmp0  = x[*idx];
3168 	    sum1 -= *v1*tmp0;
3169 	    sum2 -= *v2*tmp0;
3170 	    sum3 -= *v3*tmp0;
3171 	    sum4 -= *v4*tmp0;
3172 	    sum5 -= *v5*tmp0;
3173 	  }
3174 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3175 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3176 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3177 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3178 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3179           row -= 5;
3180 	  break;
3181         default:
3182 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3183       }
3184       CHKMEMQ;
3185     }
3186     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3187 
3188     /*
3189            t = b - D x    where D is the block diagonal
3190     */
3191     cnt = 0;
3192     for (i=0, row=0; i<m; i++) {
3193       CHKMEMQ;
3194       switch (sizes[i]){
3195         case 1:
3196 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3197 	  break;
3198         case 2:
3199 	  x1   = x[row]; x2 = x[row+1];
3200 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3201 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3202 	  t[row]   = b[row] - tmp1;
3203 	  t[row+1] = b[row+1] - tmp2; row += 2;
3204 	  cnt += 4;
3205 	  break;
3206         case 3:
3207 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3208 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3209 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3210 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3211 	  t[row] = b[row] - tmp1;
3212 	  t[row+1] = b[row+1] - tmp2;
3213 	  t[row+2] = b[row+2] - tmp3; row += 3;
3214 	  cnt += 9;
3215 	  break;
3216         case 4:
3217 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3218 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3219 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3220 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3221 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3222 	  t[row] = b[row] - tmp1;
3223 	  t[row+1] = b[row+1] - tmp2;
3224 	  t[row+2] = b[row+2] - tmp3;
3225 	  t[row+3] = b[row+3] - tmp4; row += 4;
3226 	  cnt += 16;
3227 	  break;
3228         case 5:
3229 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3230 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3231 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3232 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3233 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3234 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3235 	  t[row] = b[row] - tmp1;
3236 	  t[row+1] = b[row+1] - tmp2;
3237 	  t[row+2] = b[row+2] - tmp3;
3238 	  t[row+3] = b[row+3] - tmp4;
3239 	  t[row+4] = b[row+4] - tmp5;row += 5;
3240 	  cnt += 25;
3241 	  break;
3242         default:
3243 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3244       }
3245       CHKMEMQ;
3246     }
3247     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3248 
3249 
3250 
3251     /*
3252           Apply (L + D)^-1 where D is the block diagonal
3253     */
3254     for (i=0, row=0; i<m; i++) {
3255       sz  = diag[row] - ii[row];
3256       v1  = a->a + ii[row];
3257       idx = a->j + ii[row];
3258       CHKMEMQ;
3259       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3260       switch (sizes[i]){
3261         case 1:
3262 
3263 	  sum1  = t[row];
3264 	  for(n = 0; n<sz-1; n+=2) {
3265 	    i1   = idx[0];
3266 	    i2   = idx[1];
3267 	    idx += 2;
3268 	    tmp0 = t[i1];
3269 	    tmp1 = t[i2];
3270 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3271 	  }
3272 
3273 	  if (n == sz-1){
3274 	    tmp0  = t[*idx];
3275 	    sum1 -= *v1 * tmp0;
3276 	  }
3277 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3278 	  break;
3279         case 2:
3280 	  v2    = a->a + ii[row+1];
3281 	  sum1  = t[row];
3282 	  sum2  = t[row+1];
3283 	  for(n = 0; n<sz-1; n+=2) {
3284 	    i1   = idx[0];
3285 	    i2   = idx[1];
3286 	    idx += 2;
3287 	    tmp0 = t[i1];
3288 	    tmp1 = t[i2];
3289 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3290 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3291 	  }
3292 
3293 	  if (n == sz-1){
3294 	    tmp0  = t[*idx];
3295 	    sum1 -= v1[0] * tmp0;
3296 	    sum2 -= v2[0] * tmp0;
3297 	  }
3298 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3299 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3300 	  ibdiag  += 4; row += 2;
3301 	  break;
3302         case 3:
3303 	  v2    = a->a + ii[row+1];
3304 	  v3    = a->a + ii[row+2];
3305 	  sum1  = t[row];
3306 	  sum2  = t[row+1];
3307 	  sum3  = t[row+2];
3308 	  for(n = 0; n<sz-1; n+=2) {
3309 	    i1   = idx[0];
3310 	    i2   = idx[1];
3311 	    idx += 2;
3312 	    tmp0 = t[i1];
3313 	    tmp1 = t[i2];
3314 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3315 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3316 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3317 	  }
3318 
3319 	  if (n == sz-1){
3320 	    tmp0  = t[*idx];
3321 	    sum1 -= v1[0] * tmp0;
3322 	    sum2 -= v2[0] * tmp0;
3323 	    sum3 -= v3[0] * tmp0;
3324 	  }
3325 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3326 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3327 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3328 	  ibdiag  += 9; row += 3;
3329 	  break;
3330         case 4:
3331 	  v2    = a->a + ii[row+1];
3332 	  v3    = a->a + ii[row+2];
3333 	  v4    = a->a + ii[row+3];
3334 	  sum1  = t[row];
3335 	  sum2  = t[row+1];
3336 	  sum3  = t[row+2];
3337 	  sum4  = t[row+3];
3338 	  for(n = 0; n<sz-1; n+=2) {
3339 	    i1   = idx[0];
3340 	    i2   = idx[1];
3341 	    idx += 2;
3342 	    tmp0 = t[i1];
3343 	    tmp1 = t[i2];
3344 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3345 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3346 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3347 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3348 	  }
3349 
3350 	  if (n == sz-1){
3351 	    tmp0  = t[*idx];
3352 	    sum1 -= v1[0] * tmp0;
3353 	    sum2 -= v2[0] * tmp0;
3354 	    sum3 -= v3[0] * tmp0;
3355 	    sum4 -= v4[0] * tmp0;
3356 	  }
3357 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3358 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3359 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3360 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3361 	  ibdiag  += 16; row += 4;
3362 	  break;
3363         case 5:
3364 	  v2    = a->a + ii[row+1];
3365 	  v3    = a->a + ii[row+2];
3366 	  v4    = a->a + ii[row+3];
3367 	  v5    = a->a + ii[row+4];
3368 	  sum1  = t[row];
3369 	  sum2  = t[row+1];
3370 	  sum3  = t[row+2];
3371 	  sum4  = t[row+3];
3372 	  sum5  = t[row+4];
3373 	  for(n = 0; n<sz-1; n+=2) {
3374 	    i1   = idx[0];
3375 	    i2   = idx[1];
3376 	    idx += 2;
3377 	    tmp0 = t[i1];
3378 	    tmp1 = t[i2];
3379 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3380 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3381 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3382 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3383 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3384 	  }
3385 
3386 	  if (n == sz-1){
3387 	    tmp0  = t[*idx];
3388 	    sum1 -= v1[0] * tmp0;
3389 	    sum2 -= v2[0] * tmp0;
3390 	    sum3 -= v3[0] * tmp0;
3391 	    sum4 -= v4[0] * tmp0;
3392 	    sum5 -= v5[0] * tmp0;
3393 	  }
3394 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3395 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3396 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3397 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3398 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3399 	  ibdiag  += 25; row += 5;
3400 	  break;
3401         default:
3402 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3403       }
3404       CHKMEMQ;
3405     }
3406     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3407     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3408     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3409   }
3410   PetscFunctionReturn(0);
3411 }
3412 
3413 #undef __FUNCT__
3414 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3415 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3416 {
3417   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3418   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3419   const MatScalar    *bdiag = a->inode.bdiag;
3420   const PetscScalar  *b;
3421   PetscErrorCode      ierr;
3422   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3423   const PetscInt      *sizes = a->inode.size;
3424 
3425   PetscFunctionBegin;
3426   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3427   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3428   cnt = 0;
3429   for (i=0, row=0; i<m; i++) {
3430     switch (sizes[i]){
3431       case 1:
3432 	x[row] = b[row]*bdiag[cnt++];row++;
3433 	break;
3434       case 2:
3435 	x1   = b[row]; x2 = b[row+1];
3436 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3437 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3438 	x[row++] = tmp1;
3439 	x[row++] = tmp2;
3440 	cnt += 4;
3441 	break;
3442       case 3:
3443 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3444 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3445 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3446 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3447 	x[row++] = tmp1;
3448 	x[row++] = tmp2;
3449 	x[row++] = tmp3;
3450 	cnt += 9;
3451 	break;
3452       case 4:
3453 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3454 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3455 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3456 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3457 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3458 	x[row++] = tmp1;
3459 	x[row++] = tmp2;
3460 	x[row++] = tmp3;
3461 	x[row++] = tmp4;
3462 	cnt += 16;
3463 	break;
3464       case 5:
3465 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3466 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3467 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3468 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3469 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3470 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3471 	x[row++] = tmp1;
3472 	x[row++] = tmp2;
3473 	x[row++] = tmp3;
3474 	x[row++] = tmp4;
3475 	x[row++] = tmp5;
3476 	cnt += 25;
3477 	break;
3478       default:
3479 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3480     }
3481   }
3482   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3483   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3484   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3485   PetscFunctionReturn(0);
3486 }
3487 
3488 /*
3489     samestructure indicates that the matrix has not changed its nonzero structure so we
3490     do not need to recompute the inodes
3491 */
3492 #undef __FUNCT__
3493 #define __FUNCT__ "Mat_CheckInode"
3494 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3495 {
3496   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3497   PetscErrorCode ierr;
3498   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3499   PetscTruth     flag;
3500 
3501   PetscFunctionBegin;
3502   if (!a->inode.use)                     PetscFunctionReturn(0);
3503   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3504 
3505 
3506   m = A->rmap->n;
3507   if (a->inode.size) {ns = a->inode.size;}
3508   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3509 
3510   i          = 0;
3511   node_count = 0;
3512   idx        = a->j;
3513   ii         = a->i;
3514   while (i < m){                /* For each row */
3515     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3516     /* Limits the number of elements in a node to 'a->inode.limit' */
3517     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3518       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3519       if (nzy != nzx) break;
3520       idy  += nzx;             /* Same nonzero pattern */
3521       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3522       if (!flag) break;
3523     }
3524     ns[node_count++] = blk_size;
3525     idx += blk_size*nzx;
3526     i    = j;
3527   }
3528   /* If not enough inodes found,, do not use inode version of the routines */
3529   if (!a->inode.size && m && node_count > .9*m) {
3530     ierr = PetscFree(ns);CHKERRQ(ierr);
3531     a->inode.node_count     = 0;
3532     a->inode.size           = PETSC_NULL;
3533     a->inode.use            = PETSC_FALSE;
3534     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3535   } else {
3536     A->ops->mult              = MatMult_SeqAIJ_Inode;
3537     A->ops->sor               = MatSOR_SeqAIJ_Inode;
3538     A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3539     A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3540     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3541     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3542     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3543     A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3544     A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3545     A->ops->lufactornumeric   = MatLUFactorNumeric_SeqAIJ_Inode_inplace;
3546     a->inode.node_count       = node_count;
3547     a->inode.size             = ns;
3548     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3549   }
3550   PetscFunctionReturn(0);
3551 }
3552 
3553 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
3554 PetscInt __k, *__vi; \
3555 __vi = aj + ai[row];				\
3556 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
3557 __vi = aj + adiag[row];				\
3558 cols[nzl] = __vi[0];\
3559 __vi = aj + adiag[row+1]+1;\
3560 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
3561 
3562 
3563 /*
3564    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3565    Modified from Mat_CheckInode().
3566 
3567    Input Parameters:
3568 +  Mat A - ILU or LU matrix factor
3569 -  samestructure - TURE indicates that the matrix has not changed its nonzero structure so we
3570     do not need to recompute the inodes
3571 */
3572 #undef __FUNCT__
3573 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3574 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3575 {
3576   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3577   PetscErrorCode ierr;
3578   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3579   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3580   PetscTruth     flag;
3581 
3582   PetscFunctionBegin;
3583   if (!a->inode.use)                     PetscFunctionReturn(0);
3584   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3585 
3586   m = A->rmap->n;
3587   if (a->inode.size) {ns = a->inode.size;}
3588   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3589 
3590   i          = 0;
3591   node_count = 0;
3592   while (i < m){                /* For each row */
3593     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3594     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3595     nzx  = nzl1 + nzu1 + 1;
3596     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3597     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3598 
3599     /* Limits the number of elements in a node to 'a->inode.limit' */
3600     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3601       nzl2    = ai[j+1] - ai[j];
3602       nzu2    = adiag[j] - adiag[j+1] - 1;
3603       nzy     = nzl2 + nzu2 + 1;
3604       if( nzy != nzx) break;
3605       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3606       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
3607       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3608       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3609       ierr = PetscFree(cols2);CHKERRQ(ierr);
3610     }
3611     ns[node_count++] = blk_size;
3612     ierr = PetscFree(cols1);CHKERRQ(ierr);
3613     i    = j;
3614   }
3615   /* If not enough inodes found,, do not use inode version of the routines */
3616   if (!a->inode.size && m && node_count > .9*m) {
3617     ierr = PetscFree(ns);CHKERRQ(ierr);
3618     a->inode.node_count     = 0;
3619     a->inode.size           = PETSC_NULL;
3620     a->inode.use            = PETSC_FALSE;
3621     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3622   } else {
3623     A->ops->mult              = 0;
3624     A->ops->sor               = 0;
3625     A->ops->multadd           = 0;
3626     A->ops->getrowij          = 0;
3627     A->ops->restorerowij      = 0;
3628     A->ops->getcolumnij       = 0;
3629     A->ops->restorecolumnij   = 0;
3630     A->ops->coloringpatch     = 0;
3631     A->ops->multdiagonalblock = 0;
3632     A->ops->lufactornumeric   = MatLUFactorNumeric_SeqAIJ_Inode; /* not done yet */
3633     a->inode.node_count       = node_count;
3634     a->inode.size             = ns;
3635     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3636   }
3637   PetscFunctionReturn(0);
3638 }
3639 
3640 /*
3641      This is really ugly. if inodes are used this replaces the
3642   permutations with ones that correspond to rows/cols of the matrix
3643   rather then inode blocks
3644 */
3645 #undef __FUNCT__
3646 #define __FUNCT__ "MatInodeAdjustForInodes"
3647 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3648 {
3649   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3650 
3651   PetscFunctionBegin;
3652   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3653   if (f) {
3654     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3655   }
3656   PetscFunctionReturn(0);
3657 }
3658 
3659 EXTERN_C_BEGIN
3660 #undef __FUNCT__
3661 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3662 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3663 {
3664   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3665   PetscErrorCode ierr;
3666   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3667   const PetscInt *ridx,*cidx;
3668   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3669   PetscInt       nslim_col,*ns_col;
3670   IS             ris = *rperm,cis = *cperm;
3671 
3672   PetscFunctionBegin;
3673   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3674   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3675 
3676   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3677   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3678   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3679 
3680   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3681   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3682 
3683   /* Form the inode structure for the rows of permuted matric using inv perm*/
3684   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3685 
3686   /* Construct the permutations for rows*/
3687   for (i=0,row = 0; i<nslim_row; ++i){
3688     indx      = ridx[i];
3689     start_val = tns[indx];
3690     end_val   = tns[indx + 1];
3691     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3692   }
3693 
3694   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3695   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3696 
3697  /* Construct permutations for columns */
3698   for (i=0,col=0; i<nslim_col; ++i){
3699     indx      = cidx[i];
3700     start_val = tns[indx];
3701     end_val   = tns[indx + 1];
3702     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3703   }
3704 
3705   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3706   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3707   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3708   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3709 
3710   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3711   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3712 
3713   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3714   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3715   ierr = ISDestroy(cis);CHKERRQ(ierr);
3716   ierr = ISDestroy(ris);CHKERRQ(ierr);
3717   ierr = PetscFree(tns);CHKERRQ(ierr);
3718   PetscFunctionReturn(0);
3719 }
3720 EXTERN_C_END
3721 
3722 #undef __FUNCT__
3723 #define __FUNCT__ "MatInodeGetInodeSizes"
3724 /*@C
3725    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3726 
3727    Collective on Mat
3728 
3729    Input Parameter:
3730 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3731 
3732    Output Parameter:
3733 +  node_count - no of inodes present in the matrix.
3734 .  sizes      - an array of size node_count,with sizes of each inode.
3735 -  limit      - the max size used to generate the inodes.
3736 
3737    Level: advanced
3738 
3739    Notes: This routine returns some internal storage information
3740    of the matrix, it is intended to be used by advanced users.
3741    It should be called after the matrix is assembled.
3742    The contents of the sizes[] array should not be changed.
3743    PETSC_NULL may be passed for information not requested.
3744 
3745 .keywords: matrix, seqaij, get, inode
3746 
3747 .seealso: MatGetInfo()
3748 @*/
3749 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3750 {
3751   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3752 
3753   PetscFunctionBegin;
3754   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3755   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3756   if (f) {
3757     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3758   }
3759   PetscFunctionReturn(0);
3760 }
3761 
3762 EXTERN_C_BEGIN
3763 #undef __FUNCT__
3764 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3765 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3766 {
3767   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3768 
3769   PetscFunctionBegin;
3770   if (node_count) *node_count = a->inode.node_count;
3771   if (sizes)      *sizes      = a->inode.size;
3772   if (limit)      *limit      = a->inode.limit;
3773   PetscFunctionReturn(0);
3774 }
3775 EXTERN_C_END
3776