xref: /petsc/src/mat/impls/aij/seq/inode.c (revision d8c6e1826fd7f4e55b583f73ec45b462b8b7d8dd)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap.n;
19   m      = A->rmap.n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap.n;
72   n         = A->cmap.n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap.n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap.n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_Inode"
397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y;
402   PetscErrorCode ierr;
403   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
404 
405 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
406 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
407 #endif
408 
409   PetscFunctionBegin;
410   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
411   node_max = a->inode.node_count;
412   ns       = a->inode.size;     /* Node Size array */
413   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
414   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
415   idx  = a->j;
416   v1   = a->a;
417   ii   = a->i;
418 
419   for (i = 0,row = 0; i< node_max; ++i){
420     nsz  = ns[i];
421     n    = ii[1] - ii[0];
422     ii  += nsz;
423     sz   = n;                   /* No of non zeros in this row */
424                                 /* Switch on the size of Node */
425     switch (nsz){               /* Each loop in 'case' is unrolled */
426     case 1 :
427       sum1  = 0;
428 
429       for(n = 0; n< sz-1; n+=2) {
430         i1   = idx[0];          /* The instructions are ordered to */
431         i2   = idx[1];          /* make the compiler's job easy */
432         idx += 2;
433         tmp0 = x[i1];
434         tmp1 = x[i2];
435         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
436        }
437 
438       if (n == sz-1){          /* Take care of the last nonzero  */
439         tmp0  = x[*idx++];
440         sum1 += *v1++ * tmp0;
441       }
442       y[row++]=sum1;
443       break;
444     case 2:
445       sum1  = 0;
446       sum2  = 0;
447       v2    = v1 + n;
448 
449       for (n = 0; n< sz-1; n+=2) {
450         i1   = idx[0];
451         i2   = idx[1];
452         idx += 2;
453         tmp0 = x[i1];
454         tmp1 = x[i2];
455         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
456         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
457       }
458       if (n == sz-1){
459         tmp0  = x[*idx++];
460         sum1 += *v1++ * tmp0;
461         sum2 += *v2++ * tmp0;
462       }
463       y[row++]=sum1;
464       y[row++]=sum2;
465       v1      =v2;              /* Since the next block to be processed starts there*/
466       idx    +=sz;
467       break;
468     case 3:
469       sum1  = 0;
470       sum2  = 0;
471       sum3  = 0;
472       v2    = v1 + n;
473       v3    = v2 + n;
474 
475       for (n = 0; n< sz-1; n+=2) {
476         i1   = idx[0];
477         i2   = idx[1];
478         idx += 2;
479         tmp0 = x[i1];
480         tmp1 = x[i2];
481         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
482         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
483         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
484       }
485       if (n == sz-1){
486         tmp0  = x[*idx++];
487         sum1 += *v1++ * tmp0;
488         sum2 += *v2++ * tmp0;
489         sum3 += *v3++ * tmp0;
490       }
491       y[row++]=sum1;
492       y[row++]=sum2;
493       y[row++]=sum3;
494       v1       =v3;             /* Since the next block to be processed starts there*/
495       idx     +=2*sz;
496       break;
497     case 4:
498       sum1  = 0;
499       sum2  = 0;
500       sum3  = 0;
501       sum4  = 0;
502       v2    = v1 + n;
503       v3    = v2 + n;
504       v4    = v3 + n;
505 
506       for (n = 0; n< sz-1; n+=2) {
507         i1   = idx[0];
508         i2   = idx[1];
509         idx += 2;
510         tmp0 = x[i1];
511         tmp1 = x[i2];
512         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
513         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
514         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
515         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
516       }
517       if (n == sz-1){
518         tmp0  = x[*idx++];
519         sum1 += *v1++ * tmp0;
520         sum2 += *v2++ * tmp0;
521         sum3 += *v3++ * tmp0;
522         sum4 += *v4++ * tmp0;
523       }
524       y[row++]=sum1;
525       y[row++]=sum2;
526       y[row++]=sum3;
527       y[row++]=sum4;
528       v1      =v4;              /* Since the next block to be processed starts there*/
529       idx    +=3*sz;
530       break;
531     case 5:
532       sum1  = 0;
533       sum2  = 0;
534       sum3  = 0;
535       sum4  = 0;
536       sum5  = 0;
537       v2    = v1 + n;
538       v3    = v2 + n;
539       v4    = v3 + n;
540       v5    = v4 + n;
541 
542       for (n = 0; n<sz-1; n+=2) {
543         i1   = idx[0];
544         i2   = idx[1];
545         idx += 2;
546         tmp0 = x[i1];
547         tmp1 = x[i2];
548         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
549         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
550         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
551         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
552         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
553       }
554       if (n == sz-1){
555         tmp0  = x[*idx++];
556         sum1 += *v1++ * tmp0;
557         sum2 += *v2++ * tmp0;
558         sum3 += *v3++ * tmp0;
559         sum4 += *v4++ * tmp0;
560         sum5 += *v5++ * tmp0;
561       }
562       y[row++]=sum1;
563       y[row++]=sum2;
564       y[row++]=sum3;
565       y[row++]=sum4;
566       y[row++]=sum5;
567       v1      =v5;       /* Since the next block to be processed starts there */
568       idx    +=4*sz;
569       break;
570     default :
571       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
572     }
573   }
574   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
575   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
576   ierr = PetscLogFlops(2*a->nz - A->rmap.n);CHKERRQ(ierr);
577   PetscFunctionReturn(0);
578 }
579 /* ----------------------------------------------------------- */
580 /* Almost same code as the MatMult_Inode() */
581 #undef __FUNCT__
582 #define __FUNCT__ "MatMultAdd_Inode"
583 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
584 {
585   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
586   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
587   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
588   PetscErrorCode ierr;
589   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
590 
591   PetscFunctionBegin;
592   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
593   node_max = a->inode.node_count;
594   ns       = a->inode.size;     /* Node Size array */
595   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
596   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
597   if (zz != yy) {
598     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
599   } else {
600     z = y;
601   }
602   zt = z;
603 
604   idx  = a->j;
605   v1   = a->a;
606   ii   = a->i;
607 
608   for (i = 0,row = 0; i< node_max; ++i){
609     nsz  = ns[i];
610     n    = ii[1] - ii[0];
611     ii  += nsz;
612     sz   = n;                   /* No of non zeros in this row */
613                                 /* Switch on the size of Node */
614     switch (nsz){               /* Each loop in 'case' is unrolled */
615     case 1 :
616       sum1  = *zt++;
617 
618       for(n = 0; n< sz-1; n+=2) {
619         i1   = idx[0];          /* The instructions are ordered to */
620         i2   = idx[1];          /* make the compiler's job easy */
621         idx += 2;
622         tmp0 = x[i1];
623         tmp1 = x[i2];
624         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
625        }
626 
627       if(n   == sz-1){          /* Take care of the last nonzero  */
628         tmp0  = x[*idx++];
629         sum1 += *v1++ * tmp0;
630       }
631       y[row++]=sum1;
632       break;
633     case 2:
634       sum1  = *zt++;
635       sum2  = *zt++;
636       v2    = v1 + n;
637 
638       for(n = 0; n< sz-1; n+=2) {
639         i1   = idx[0];
640         i2   = idx[1];
641         idx += 2;
642         tmp0 = x[i1];
643         tmp1 = x[i2];
644         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
645         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
646       }
647       if(n   == sz-1){
648         tmp0  = x[*idx++];
649         sum1 += *v1++ * tmp0;
650         sum2 += *v2++ * tmp0;
651       }
652       y[row++]=sum1;
653       y[row++]=sum2;
654       v1      =v2;              /* Since the next block to be processed starts there*/
655       idx    +=sz;
656       break;
657     case 3:
658       sum1  = *zt++;
659       sum2  = *zt++;
660       sum3  = *zt++;
661       v2    = v1 + n;
662       v3    = v2 + n;
663 
664       for (n = 0; n< sz-1; n+=2) {
665         i1   = idx[0];
666         i2   = idx[1];
667         idx += 2;
668         tmp0 = x[i1];
669         tmp1 = x[i2];
670         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
671         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
672         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
673       }
674       if (n == sz-1){
675         tmp0  = x[*idx++];
676         sum1 += *v1++ * tmp0;
677         sum2 += *v2++ * tmp0;
678         sum3 += *v3++ * tmp0;
679       }
680       y[row++]=sum1;
681       y[row++]=sum2;
682       y[row++]=sum3;
683       v1       =v3;             /* Since the next block to be processed starts there*/
684       idx     +=2*sz;
685       break;
686     case 4:
687       sum1  = *zt++;
688       sum2  = *zt++;
689       sum3  = *zt++;
690       sum4  = *zt++;
691       v2    = v1 + n;
692       v3    = v2 + n;
693       v4    = v3 + n;
694 
695       for (n = 0; n< sz-1; n+=2) {
696         i1   = idx[0];
697         i2   = idx[1];
698         idx += 2;
699         tmp0 = x[i1];
700         tmp1 = x[i2];
701         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
702         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
703         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
704         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
705       }
706       if (n == sz-1){
707         tmp0  = x[*idx++];
708         sum1 += *v1++ * tmp0;
709         sum2 += *v2++ * tmp0;
710         sum3 += *v3++ * tmp0;
711         sum4 += *v4++ * tmp0;
712       }
713       y[row++]=sum1;
714       y[row++]=sum2;
715       y[row++]=sum3;
716       y[row++]=sum4;
717       v1      =v4;              /* Since the next block to be processed starts there*/
718       idx    +=3*sz;
719       break;
720     case 5:
721       sum1  = *zt++;
722       sum2  = *zt++;
723       sum3  = *zt++;
724       sum4  = *zt++;
725       sum5  = *zt++;
726       v2    = v1 + n;
727       v3    = v2 + n;
728       v4    = v3 + n;
729       v5    = v4 + n;
730 
731       for (n = 0; n<sz-1; n+=2) {
732         i1   = idx[0];
733         i2   = idx[1];
734         idx += 2;
735         tmp0 = x[i1];
736         tmp1 = x[i2];
737         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
738         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
739         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
740         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
741         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
742       }
743       if(n   == sz-1){
744         tmp0  = x[*idx++];
745         sum1 += *v1++ * tmp0;
746         sum2 += *v2++ * tmp0;
747         sum3 += *v3++ * tmp0;
748         sum4 += *v4++ * tmp0;
749         sum5 += *v5++ * tmp0;
750       }
751       y[row++]=sum1;
752       y[row++]=sum2;
753       y[row++]=sum3;
754       y[row++]=sum4;
755       y[row++]=sum5;
756       v1      =v5;       /* Since the next block to be processed starts there */
757       idx    +=4*sz;
758       break;
759     default :
760       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
761     }
762   }
763   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
764   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
765   if (zz != yy) {
766     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
767   }
768   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
769   PetscFunctionReturn(0);
770 }
771 
772 /* ----------------------------------------------------------- */
773 #undef __FUNCT__
774 #define __FUNCT__ "MatSolve_Inode"
775 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
776 {
777   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
778   IS             iscol = a->col,isrow = a->row;
779   PetscErrorCode ierr;
780   PetscInt       *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
781   PetscInt       node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
782   PetscScalar    *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
783   PetscScalar    sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
784 
785   PetscFunctionBegin;
786   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
787   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
788   node_max = a->inode.node_count;
789   ns       = a->inode.size;     /* Node Size array */
790 
791   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
792   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
793   tmp  = a->solve_work;
794 
795   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
796   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
797 
798   /* forward solve the lower triangular */
799   tmps = tmp ;
800   aa   = a_a ;
801   aj   = a_j ;
802   ad   = a->diag;
803 
804   for (i = 0,row = 0; i< node_max; ++i){
805     nsz = ns[i];
806     aii = ai[row];
807     v1  = aa + aii;
808     vi  = aj + aii;
809     nz  = ad[row]- aii;
810 
811     switch (nsz){               /* Each loop in 'case' is unrolled */
812     case 1 :
813       sum1 = b[*r++];
814       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
815       for(j=0; j<nz-1; j+=2){
816         i0   = vi[0];
817         i1   = vi[1];
818         vi  +=2;
819         tmp0 = tmps[i0];
820         tmp1 = tmps[i1];
821         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
822       }
823       if(j == nz-1){
824         tmp0 = tmps[*vi++];
825         sum1 -= *v1++ *tmp0;
826       }
827       tmp[row ++]=sum1;
828       break;
829     case 2:
830       sum1 = b[*r++];
831       sum2 = b[*r++];
832       v2   = aa + ai[row+1];
833 
834       for(j=0; j<nz-1; j+=2){
835         i0   = vi[0];
836         i1   = vi[1];
837         vi  +=2;
838         tmp0 = tmps[i0];
839         tmp1 = tmps[i1];
840         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
841         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
842       }
843       if(j == nz-1){
844         tmp0 = tmps[*vi++];
845         sum1 -= *v1++ *tmp0;
846         sum2 -= *v2++ *tmp0;
847       }
848       sum2 -= *v2++ * sum1;
849       tmp[row ++]=sum1;
850       tmp[row ++]=sum2;
851       break;
852     case 3:
853       sum1 = b[*r++];
854       sum2 = b[*r++];
855       sum3 = b[*r++];
856       v2   = aa + ai[row+1];
857       v3   = aa + ai[row+2];
858 
859       for (j=0; j<nz-1; j+=2){
860         i0   = vi[0];
861         i1   = vi[1];
862         vi  +=2;
863         tmp0 = tmps[i0];
864         tmp1 = tmps[i1];
865         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
866         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
867         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
868       }
869       if (j == nz-1){
870         tmp0 = tmps[*vi++];
871         sum1 -= *v1++ *tmp0;
872         sum2 -= *v2++ *tmp0;
873         sum3 -= *v3++ *tmp0;
874       }
875       sum2 -= *v2++ * sum1;
876       sum3 -= *v3++ * sum1;
877       sum3 -= *v3++ * sum2;
878       tmp[row ++]=sum1;
879       tmp[row ++]=sum2;
880       tmp[row ++]=sum3;
881       break;
882 
883     case 4:
884       sum1 = b[*r++];
885       sum2 = b[*r++];
886       sum3 = b[*r++];
887       sum4 = b[*r++];
888       v2   = aa + ai[row+1];
889       v3   = aa + ai[row+2];
890       v4   = aa + ai[row+3];
891 
892       for (j=0; j<nz-1; j+=2){
893         i0   = vi[0];
894         i1   = vi[1];
895         vi  +=2;
896         tmp0 = tmps[i0];
897         tmp1 = tmps[i1];
898         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
899         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
900         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
901         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
902       }
903       if (j == nz-1){
904         tmp0 = tmps[*vi++];
905         sum1 -= *v1++ *tmp0;
906         sum2 -= *v2++ *tmp0;
907         sum3 -= *v3++ *tmp0;
908         sum4 -= *v4++ *tmp0;
909       }
910       sum2 -= *v2++ * sum1;
911       sum3 -= *v3++ * sum1;
912       sum4 -= *v4++ * sum1;
913       sum3 -= *v3++ * sum2;
914       sum4 -= *v4++ * sum2;
915       sum4 -= *v4++ * sum3;
916 
917       tmp[row ++]=sum1;
918       tmp[row ++]=sum2;
919       tmp[row ++]=sum3;
920       tmp[row ++]=sum4;
921       break;
922     case 5:
923       sum1 = b[*r++];
924       sum2 = b[*r++];
925       sum3 = b[*r++];
926       sum4 = b[*r++];
927       sum5 = b[*r++];
928       v2   = aa + ai[row+1];
929       v3   = aa + ai[row+2];
930       v4   = aa + ai[row+3];
931       v5   = aa + ai[row+4];
932 
933       for (j=0; j<nz-1; j+=2){
934         i0   = vi[0];
935         i1   = vi[1];
936         vi  +=2;
937         tmp0 = tmps[i0];
938         tmp1 = tmps[i1];
939         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
940         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
941         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
942         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
943         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
944       }
945       if (j == nz-1){
946         tmp0 = tmps[*vi++];
947         sum1 -= *v1++ *tmp0;
948         sum2 -= *v2++ *tmp0;
949         sum3 -= *v3++ *tmp0;
950         sum4 -= *v4++ *tmp0;
951         sum5 -= *v5++ *tmp0;
952       }
953 
954       sum2 -= *v2++ * sum1;
955       sum3 -= *v3++ * sum1;
956       sum4 -= *v4++ * sum1;
957       sum5 -= *v5++ * sum1;
958       sum3 -= *v3++ * sum2;
959       sum4 -= *v4++ * sum2;
960       sum5 -= *v5++ * sum2;
961       sum4 -= *v4++ * sum3;
962       sum5 -= *v5++ * sum3;
963       sum5 -= *v5++ * sum4;
964 
965       tmp[row ++]=sum1;
966       tmp[row ++]=sum2;
967       tmp[row ++]=sum3;
968       tmp[row ++]=sum4;
969       tmp[row ++]=sum5;
970       break;
971     default:
972       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
973     }
974   }
975   /* backward solve the upper triangular */
976   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
977     nsz = ns[i];
978     aii = ai[row+1] -1;
979     v1  = aa + aii;
980     vi  = aj + aii;
981     nz  = aii- ad[row];
982     switch (nsz){               /* Each loop in 'case' is unrolled */
983     case 1 :
984       sum1 = tmp[row];
985 
986       for(j=nz ; j>1; j-=2){
987         vi  -=2;
988         i0   = vi[2];
989         i1   = vi[1];
990         tmp0 = tmps[i0];
991         tmp1 = tmps[i1];
992         v1   -= 2;
993         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
994       }
995       if (j==1){
996         tmp0  = tmps[*vi--];
997         sum1 -= *v1-- * tmp0;
998       }
999       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1000       break;
1001     case 2 :
1002       sum1 = tmp[row];
1003       sum2 = tmp[row -1];
1004       v2   = aa + ai[row]-1;
1005       for (j=nz ; j>1; j-=2){
1006         vi  -=2;
1007         i0   = vi[2];
1008         i1   = vi[1];
1009         tmp0 = tmps[i0];
1010         tmp1 = tmps[i1];
1011         v1   -= 2;
1012         v2   -= 2;
1013         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1014         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1015       }
1016       if (j==1){
1017         tmp0  = tmps[*vi--];
1018         sum1 -= *v1-- * tmp0;
1019         sum2 -= *v2-- * tmp0;
1020       }
1021 
1022       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1023       sum2   -= *v2-- * tmp0;
1024       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1025       break;
1026     case 3 :
1027       sum1 = tmp[row];
1028       sum2 = tmp[row -1];
1029       sum3 = tmp[row -2];
1030       v2   = aa + ai[row]-1;
1031       v3   = aa + ai[row -1]-1;
1032       for (j=nz ; j>1; j-=2){
1033         vi  -=2;
1034         i0   = vi[2];
1035         i1   = vi[1];
1036         tmp0 = tmps[i0];
1037         tmp1 = tmps[i1];
1038         v1   -= 2;
1039         v2   -= 2;
1040         v3   -= 2;
1041         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1042         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1043         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1044       }
1045       if (j==1){
1046         tmp0  = tmps[*vi--];
1047         sum1 -= *v1-- * tmp0;
1048         sum2 -= *v2-- * tmp0;
1049         sum3 -= *v3-- * tmp0;
1050       }
1051       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1052       sum2   -= *v2-- * tmp0;
1053       sum3   -= *v3-- * tmp0;
1054       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1055       sum3   -= *v3-- * tmp0;
1056       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1057 
1058       break;
1059     case 4 :
1060       sum1 = tmp[row];
1061       sum2 = tmp[row -1];
1062       sum3 = tmp[row -2];
1063       sum4 = tmp[row -3];
1064       v2   = aa + ai[row]-1;
1065       v3   = aa + ai[row -1]-1;
1066       v4   = aa + ai[row -2]-1;
1067 
1068       for (j=nz ; j>1; j-=2){
1069         vi  -=2;
1070         i0   = vi[2];
1071         i1   = vi[1];
1072         tmp0 = tmps[i0];
1073         tmp1 = tmps[i1];
1074         v1  -= 2;
1075         v2  -= 2;
1076         v3  -= 2;
1077         v4  -= 2;
1078         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1079         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1080         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1081         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1082       }
1083       if (j==1){
1084         tmp0  = tmps[*vi--];
1085         sum1 -= *v1-- * tmp0;
1086         sum2 -= *v2-- * tmp0;
1087         sum3 -= *v3-- * tmp0;
1088         sum4 -= *v4-- * tmp0;
1089       }
1090 
1091       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1092       sum2   -= *v2-- * tmp0;
1093       sum3   -= *v3-- * tmp0;
1094       sum4   -= *v4-- * tmp0;
1095       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1096       sum3   -= *v3-- * tmp0;
1097       sum4   -= *v4-- * tmp0;
1098       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1099       sum4   -= *v4-- * tmp0;
1100       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1101       break;
1102     case 5 :
1103       sum1 = tmp[row];
1104       sum2 = tmp[row -1];
1105       sum3 = tmp[row -2];
1106       sum4 = tmp[row -3];
1107       sum5 = tmp[row -4];
1108       v2   = aa + ai[row]-1;
1109       v3   = aa + ai[row -1]-1;
1110       v4   = aa + ai[row -2]-1;
1111       v5   = aa + ai[row -3]-1;
1112       for (j=nz ; j>1; j-=2){
1113         vi  -= 2;
1114         i0   = vi[2];
1115         i1   = vi[1];
1116         tmp0 = tmps[i0];
1117         tmp1 = tmps[i1];
1118         v1   -= 2;
1119         v2   -= 2;
1120         v3   -= 2;
1121         v4   -= 2;
1122         v5   -= 2;
1123         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1124         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1125         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1126         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1127         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1128       }
1129       if (j==1){
1130         tmp0  = tmps[*vi--];
1131         sum1 -= *v1-- * tmp0;
1132         sum2 -= *v2-- * tmp0;
1133         sum3 -= *v3-- * tmp0;
1134         sum4 -= *v4-- * tmp0;
1135         sum5 -= *v5-- * tmp0;
1136       }
1137 
1138       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1139       sum2   -= *v2-- * tmp0;
1140       sum3   -= *v3-- * tmp0;
1141       sum4   -= *v4-- * tmp0;
1142       sum5   -= *v5-- * tmp0;
1143       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1144       sum3   -= *v3-- * tmp0;
1145       sum4   -= *v4-- * tmp0;
1146       sum5   -= *v5-- * tmp0;
1147       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1148       sum4   -= *v4-- * tmp0;
1149       sum5   -= *v5-- * tmp0;
1150       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1151       sum5   -= *v5-- * tmp0;
1152       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1153       break;
1154     default:
1155       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1156     }
1157   }
1158   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1159   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1160   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1161   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1162   ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr);
1163   PetscFunctionReturn(0);
1164 }
1165 
1166 #undef __FUNCT__
1167 #define __FUNCT__ "MatLUFactorNumeric_Inode"
1168 PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1169 {
1170   Mat            C = *B;
1171   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1172   IS             iscol = b->col,isrow = b->row,isicol = b->icol;
1173   PetscErrorCode ierr;
1174   PetscInt       *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1175   PetscInt       *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1176   PetscInt       *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1177   PetscInt       *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1178   PetscScalar    *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1179   PetscScalar    tmp,*ba = b->a,*aa = a->a,*pv;
1180   PetscReal      rs=0.0;
1181   LUShift_Ctx    sctx;
1182   PetscInt       newshift;
1183 
1184   PetscFunctionBegin;
1185   sctx.shift_top  = 0;
1186   sctx.nshift_max = 0;
1187   sctx.shift_lo   = 0;
1188   sctx.shift_hi   = 0;
1189 
1190   /* if both shift schemes are chosen by user, only use info->shiftpd */
1191   if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1192   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1193     sctx.shift_top = 0;
1194     for (i=0; i<n; i++) {
1195       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1196       rs    = 0.0;
1197       ajtmp = aj + ai[i];
1198       rtmp1 = aa + ai[i];
1199       nz = ai[i+1] - ai[i];
1200       for (j=0; j<nz; j++){
1201         if (*ajtmp != i){
1202           rs += PetscAbsScalar(*rtmp1++);
1203         } else {
1204           rs -= PetscRealPart(*rtmp1++);
1205         }
1206         ajtmp++;
1207       }
1208       if (rs>sctx.shift_top) sctx.shift_top = rs;
1209     }
1210     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1211     sctx.shift_top *= 1.1;
1212     sctx.nshift_max = 5;
1213     sctx.shift_lo   = 0.;
1214     sctx.shift_hi   = 1.;
1215   }
1216   sctx.shift_amount = 0;
1217   sctx.nshift       = 0;
1218 
1219   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1220   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1221   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1222   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1223   ierr  = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1224   ics   = ic ;
1225   rtmp2 = rtmp1 + n;
1226   rtmp3 = rtmp2 + n;
1227 
1228   node_max = a->inode.node_count;
1229   ns       = a->inode.size ;
1230   if (!ns){
1231     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1232   }
1233 
1234   /* If max inode size > 3, split it into two inodes.*/
1235   /* also map the inode sizes according to the ordering */
1236   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1237   for (i=0,j=0; i<node_max; ++i,++j){
1238     if (ns[i]>3) {
1239       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1240       ++j;
1241       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1242     } else {
1243       tmp_vec1[j] = ns[i];
1244     }
1245   }
1246   /* Use the correct node_max */
1247   node_max = j;
1248 
1249   /* Now reorder the inode info based on mat re-ordering info */
1250   /* First create a row -> inode_size_array_index map */
1251   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1252   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1253   for (i=0,row=0; i<node_max; i++) {
1254     nodesz = tmp_vec1[i];
1255     for (j=0; j<nodesz; j++,row++) {
1256       nsmap[row] = i;
1257     }
1258   }
1259   /* Using nsmap, create a reordered ns structure */
1260   for (i=0,j=0; i< node_max; i++) {
1261     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1262     tmp_vec2[i]  = nodesz;
1263     j           += nodesz;
1264   }
1265   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1266   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1267   /* Now use the correct ns */
1268   ns = tmp_vec2;
1269 
1270   do {
1271     sctx.lushift = PETSC_FALSE;
1272     /* Now loop over each block-row, and do the factorization */
1273     for (i=0,row=0; i<node_max; i++) {
1274       nodesz = ns[i];
1275       nz     = bi[row+1] - bi[row];
1276       bjtmp  = bj + bi[row];
1277 
1278       switch (nodesz){
1279       case 1:
1280         for  (j=0; j<nz; j++){
1281           idx        = bjtmp[j];
1282           rtmp1[idx] = 0.0;
1283         }
1284 
1285         /* load in initial (unfactored row) */
1286         idx    = r[row];
1287         nz_tmp = ai[idx+1] - ai[idx];
1288         ajtmp  = aj + ai[idx];
1289         v1     = aa + ai[idx];
1290 
1291         for (j=0; j<nz_tmp; j++) {
1292           idx        = ics[ajtmp[j]];
1293           rtmp1[idx] = v1[j];
1294         }
1295         rtmp1[ics[r[row]]] += sctx.shift_amount;
1296 
1297         prow = *bjtmp++ ;
1298         while (prow < row) {
1299           pc1 = rtmp1 + prow;
1300           if (*pc1 != 0.0){
1301             pv   = ba + bd[prow];
1302             pj   = nbj + bd[prow];
1303             mul1 = *pc1 * *pv++;
1304             *pc1 = mul1;
1305             nz_tmp = bi[prow+1] - bd[prow] - 1;
1306             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1307             for (j=0; j<nz_tmp; j++) {
1308               tmp = pv[j];
1309               idx = pj[j];
1310               rtmp1[idx] -= mul1 * tmp;
1311             }
1312           }
1313           prow = *bjtmp++ ;
1314         }
1315         pj  = bj + bi[row];
1316         pc1 = ba + bi[row];
1317 
1318         sctx.pv    = rtmp1[row];
1319         rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */
1320         rs         = 0.0;
1321         for (j=0; j<nz; j++) {
1322           idx    = pj[j];
1323           pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */
1324           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1325         }
1326         sctx.rs  = rs;
1327         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1328         if (newshift == 1) goto endofwhile;
1329         break;
1330 
1331       case 2:
1332         for (j=0; j<nz; j++) {
1333           idx        = bjtmp[j];
1334           rtmp1[idx] = 0.0;
1335           rtmp2[idx] = 0.0;
1336         }
1337 
1338         /* load in initial (unfactored row) */
1339         idx    = r[row];
1340         nz_tmp = ai[idx+1] - ai[idx];
1341         ajtmp  = aj + ai[idx];
1342         v1     = aa + ai[idx];
1343         v2     = aa + ai[idx+1];
1344         for (j=0; j<nz_tmp; j++) {
1345           idx        = ics[ajtmp[j]];
1346           rtmp1[idx] = v1[j];
1347           rtmp2[idx] = v2[j];
1348         }
1349         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1350         rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1351 
1352         prow = *bjtmp++ ;
1353         while (prow < row) {
1354           pc1 = rtmp1 + prow;
1355           pc2 = rtmp2 + prow;
1356           if (*pc1 != 0.0 || *pc2 != 0.0){
1357             pv   = ba + bd[prow];
1358             pj   = nbj + bd[prow];
1359             mul1 = *pc1 * *pv;
1360             mul2 = *pc2 * *pv;
1361             ++pv;
1362             *pc1 = mul1;
1363             *pc2 = mul2;
1364 
1365             nz_tmp = bi[prow+1] - bd[prow] - 1;
1366             for (j=0; j<nz_tmp; j++) {
1367               tmp = pv[j];
1368               idx = pj[j];
1369               rtmp1[idx] -= mul1 * tmp;
1370               rtmp2[idx] -= mul2 * tmp;
1371             }
1372             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1373           }
1374           prow = *bjtmp++ ;
1375         }
1376 
1377         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1378         pc1 = rtmp1 + prow;
1379         pc2 = rtmp2 + prow;
1380 
1381         sctx.pv = *pc1;
1382         pj      = bj + bi[prow];
1383         rs      = 0.0;
1384         for (j=0; j<nz; j++){
1385           idx = pj[j];
1386           if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]);
1387         }
1388         sctx.rs = rs;
1389         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1390         if (newshift == 1) goto endofwhile;
1391 
1392         if (*pc2 != 0.0){
1393           pj     = nbj + bd[prow];
1394           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1395           *pc2   = mul2;
1396           nz_tmp = bi[prow+1] - bd[prow] - 1;
1397           for (j=0; j<nz_tmp; j++) {
1398             idx = pj[j] ;
1399             tmp = rtmp1[idx];
1400             rtmp2[idx] -= mul2 * tmp;
1401           }
1402           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1403         }
1404 
1405         pj  = bj + bi[row];
1406         pc1 = ba + bi[row];
1407         pc2 = ba + bi[row+1];
1408 
1409         sctx.pv = rtmp2[row+1];
1410         rs = 0.0;
1411         rtmp1[row]   = 1.0/rtmp1[row];
1412         rtmp2[row+1] = 1.0/rtmp2[row+1];
1413         /* copy row entries from dense representation to sparse */
1414         for (j=0; j<nz; j++) {
1415           idx    = pj[j];
1416           pc1[j] = rtmp1[idx];
1417           pc2[j] = rtmp2[idx];
1418           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1419         }
1420         sctx.rs = rs;
1421         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1422         if (newshift == 1) goto endofwhile;
1423         break;
1424 
1425       case 3:
1426         for  (j=0; j<nz; j++) {
1427           idx        = bjtmp[j];
1428           rtmp1[idx] = 0.0;
1429           rtmp2[idx] = 0.0;
1430           rtmp3[idx] = 0.0;
1431         }
1432         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1433         idx    = r[row];
1434         nz_tmp = ai[idx+1] - ai[idx];
1435         ajtmp = aj + ai[idx];
1436         v1    = aa + ai[idx];
1437         v2    = aa + ai[idx+1];
1438         v3    = aa + ai[idx+2];
1439         for (j=0; j<nz_tmp; j++) {
1440           idx        = ics[ajtmp[j]];
1441           rtmp1[idx] = v1[j];
1442           rtmp2[idx] = v2[j];
1443           rtmp3[idx] = v3[j];
1444         }
1445         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1446         rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1447         rtmp3[ics[r[row+2]]] += sctx.shift_amount;
1448 
1449         /* loop over all pivot row blocks above this row block */
1450         prow = *bjtmp++ ;
1451         while (prow < row) {
1452           pc1 = rtmp1 + prow;
1453           pc2 = rtmp2 + prow;
1454           pc3 = rtmp3 + prow;
1455           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1456             pv   = ba  + bd[prow];
1457             pj   = nbj + bd[prow];
1458             mul1 = *pc1 * *pv;
1459             mul2 = *pc2 * *pv;
1460             mul3 = *pc3 * *pv;
1461             ++pv;
1462             *pc1 = mul1;
1463             *pc2 = mul2;
1464             *pc3 = mul3;
1465 
1466             nz_tmp = bi[prow+1] - bd[prow] - 1;
1467             /* update this row based on pivot row */
1468             for (j=0; j<nz_tmp; j++) {
1469               tmp = pv[j];
1470               idx = pj[j];
1471               rtmp1[idx] -= mul1 * tmp;
1472               rtmp2[idx] -= mul2 * tmp;
1473               rtmp3[idx] -= mul3 * tmp;
1474             }
1475             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1476           }
1477           prow = *bjtmp++ ;
1478         }
1479 
1480         /* Now take care of diagonal 3x3 block in this set of rows */
1481         /* note: prow = row here */
1482         pc1 = rtmp1 + prow;
1483         pc2 = rtmp2 + prow;
1484         pc3 = rtmp3 + prow;
1485 
1486         sctx.pv = *pc1;
1487         pj      = bj + bi[prow];
1488         rs      = 0.0;
1489         for (j=0; j<nz; j++){
1490           idx = pj[j];
1491           if (idx != row) rs += PetscAbsScalar(rtmp1[idx]);
1492         }
1493         sctx.rs = rs;
1494         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1495         if (newshift == 1) goto endofwhile;
1496 
1497         if (*pc2 != 0.0 || *pc3 != 0.0){
1498           mul2 = (*pc2)/(*pc1);
1499           mul3 = (*pc3)/(*pc1);
1500           *pc2 = mul2;
1501           *pc3 = mul3;
1502           nz_tmp = bi[prow+1] - bd[prow] - 1;
1503           pj     = nbj + bd[prow];
1504           for (j=0; j<nz_tmp; j++) {
1505             idx = pj[j] ;
1506             tmp = rtmp1[idx];
1507             rtmp2[idx] -= mul2 * tmp;
1508             rtmp3[idx] -= mul3 * tmp;
1509           }
1510           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1511         }
1512         ++prow;
1513 
1514         pc2 = rtmp2 + prow;
1515         pc3 = rtmp3 + prow;
1516         sctx.pv = *pc2;
1517         pj      = bj + bi[prow];
1518         rs      = 0.0;
1519         for (j=0; j<nz; j++){
1520           idx = pj[j];
1521           if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]);
1522         }
1523         sctx.rs = rs;
1524         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1525         if (newshift == 1) goto endofwhile;
1526 
1527         if (*pc3 != 0.0){
1528           mul3   = (*pc3)/(*pc2);
1529           *pc3   = mul3;
1530           pj     = nbj + bd[prow];
1531           nz_tmp = bi[prow+1] - bd[prow] - 1;
1532           for (j=0; j<nz_tmp; j++) {
1533             idx = pj[j] ;
1534             tmp = rtmp2[idx];
1535             rtmp3[idx] -= mul3 * tmp;
1536           }
1537           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1538         }
1539 
1540         pj  = bj + bi[row];
1541         pc1 = ba + bi[row];
1542         pc2 = ba + bi[row+1];
1543         pc3 = ba + bi[row+2];
1544 
1545         sctx.pv = rtmp3[row+2];
1546         rs = 0.0;
1547         rtmp1[row]   = 1.0/rtmp1[row];
1548         rtmp2[row+1] = 1.0/rtmp2[row+1];
1549         rtmp3[row+2] = 1.0/rtmp3[row+2];
1550         /* copy row entries from dense representation to sparse */
1551         for (j=0; j<nz; j++) {
1552           idx    = pj[j];
1553           pc1[j] = rtmp1[idx];
1554           pc2[j] = rtmp2[idx];
1555           pc3[j] = rtmp3[idx];
1556           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1557         }
1558 
1559         sctx.rs = rs;
1560         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
1561         if (newshift == 1) goto endofwhile;
1562         break;
1563 
1564       default:
1565         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1566       }
1567       row += nodesz;                 /* Update the row */
1568     }
1569     endofwhile:;
1570   } while (sctx.lushift);
1571   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1572   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1573   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1574   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1575   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1576   C->factor      = FACTOR_LU;
1577   C->assembled   = PETSC_TRUE;
1578   if (sctx.nshift) {
1579     if (info->shiftnz) {
1580       ierr = PetscInfo2(0,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1581     } else if (info->shiftpd) {
1582       ierr = PetscInfo4(0,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1583     }
1584   }
1585   ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr);
1586   PetscFunctionReturn(0);
1587 }
1588 
1589 /*
1590      Makes a longer coloring[] array and calls the usual code with that
1591 */
1592 #undef __FUNCT__
1593 #define __FUNCT__ "MatColoringPatch_Inode"
1594 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1595 {
1596   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1597   PetscErrorCode  ierr;
1598   PetscInt        n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1599   PetscInt        *colorused,i;
1600   ISColoringValue *newcolor;
1601 
1602   PetscFunctionBegin;
1603   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1604   /* loop over inodes, marking a color for each column*/
1605   row = 0;
1606   for (i=0; i<m; i++){
1607     for (j=0; j<ns[i]; j++) {
1608       newcolor[row++] = coloring[i] + j*ncolors;
1609     }
1610   }
1611 
1612   /* eliminate unneeded colors */
1613   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1614   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1615   for (i=0; i<n; i++) {
1616     colorused[newcolor[i]] = 1;
1617   }
1618 
1619   for (i=1; i<5*ncolors; i++) {
1620     colorused[i] += colorused[i-1];
1621   }
1622   ncolors = colorused[5*ncolors-1];
1623   for (i=0; i<n; i++) {
1624     newcolor[i] = colorused[newcolor[i]]-1;
1625   }
1626   ierr = PetscFree(colorused);CHKERRQ(ierr);
1627   ierr = ISColoringCreate(mat->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1628   ierr = PetscFree(coloring);CHKERRQ(ierr);
1629   PetscFunctionReturn(0);
1630 }
1631 
1632 #include "src/inline/ilu.h"
1633 
1634 #undef __FUNCT__
1635 #define __FUNCT__ "MatRelax_Inode"
1636 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1637 {
1638   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
1639   PetscScalar     *x,*xs,*ibdiag,*bdiag,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3,*v1,*v2,*v3,*v4,*v5;
1640   PetscScalar     *v = a->a,*b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1641   PetscErrorCode  ierr;
1642   PetscInt        n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1643   PetscInt        *idx,*diag = a->diag,*ii = a->i,sz,k;
1644 
1645   PetscFunctionBegin;
1646   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0");
1647 
1648   its = its*lits;
1649 
1650   if (!a->inode.ibdiag) {
1651     /* calculate space needed for diagonal blocks */
1652     for (i=0; i<m; i++) {
1653       cnt += sizes[i]*sizes[i];
1654     }
1655     a->inode.bdiagsize = cnt;
1656     ierr   = PetscMalloc2(cnt,PetscScalar,&a->inode.ibdiag,cnt,PetscScalar,&a->inode.bdiag);CHKERRQ(ierr);
1657     ibdiag = a->inode.ibdiag;
1658     bdiag  = a->inode.bdiag;
1659     /* copy over the diagonal blocks and invert them */
1660 
1661     cnt = 0;
1662     for (i=0, row = 0; i<m; i++) {
1663       for (j=0; j<sizes[i]; j++) {
1664         for (k=0; k<sizes[i]; k++) {
1665           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1666         }
1667       }
1668       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(PetscScalar));CHKERRQ(ierr);
1669 
1670       switch(sizes[i]) {
1671         case 1:
1672           /* Create matrix data structure */
1673           if (!ibdiag[cnt]) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1674           ierr = ibdiag[cnt] = 1.0/ibdiag[cnt];
1675           break;
1676         case 2:
1677           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt);CHKERRQ(ierr);
1678           break;
1679         case 3:
1680           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt);CHKERRQ(ierr);
1681           break;
1682         case 4:
1683           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt);CHKERRQ(ierr);
1684           break;
1685         case 5:
1686           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt);CHKERRQ(ierr);
1687           break;
1688        default:
1689 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1690       }
1691       cnt += sizes[i]*sizes[i];
1692       row += sizes[i];
1693     }
1694   }
1695   ibdiag = a->inode.ibdiag;
1696   bdiag  = a->inode.bdiag;
1697 
1698   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1699   if (xx != bb) {
1700     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1701   } else {
1702     b = x;
1703   }
1704 
1705   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1706   xs   = x;
1707   if (flag & SOR_ZERO_INITIAL_GUESS) {
1708     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1709 
1710       for (i=0, row=0; i<m; i++) {
1711         sz  = diag[row] - ii[row];
1712         v1  = a->a + ii[row];
1713         idx = a->j + ii[row];
1714 
1715         /* see comments for MatMult_Inode() for how this is coded */
1716         switch (sizes[i]){
1717           case 1:
1718 
1719             sum1  = b[row];
1720             for(n = 0; n<sz-1; n+=2) {
1721               i1   = idx[0];
1722               i2   = idx[1];
1723               idx += 2;
1724               tmp0 = x[i1];
1725               tmp1 = x[i2];
1726               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1727             }
1728 
1729             if (n == sz-1){
1730               tmp0  = x[*idx];
1731               sum1 -= *v1 * tmp0;
1732             }
1733             x[row++] = sum1*(*ibdiag++);
1734             break;
1735           case 2:
1736             v2    = a->a + ii[row+1];
1737             sum1  = b[row];
1738             sum2  = b[row+1];
1739             for(n = 0; n<sz-1; n+=2) {
1740               i1   = idx[0];
1741               i2   = idx[1];
1742               idx += 2;
1743               tmp0 = x[i1];
1744               tmp1 = x[i2];
1745               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1746               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1747             }
1748 
1749             if (n == sz-1){
1750               tmp0  = x[*idx];
1751               sum1 -= v1[0] * tmp0;
1752               sum2 -= v2[0] * tmp0;
1753             }
1754             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1755             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1756             ibdiag  += 4;
1757             break;
1758           case 3:
1759             v2    = a->a + ii[row+1];
1760             v3    = a->a + ii[row+2];
1761             sum1  = b[row];
1762             sum2  = b[row+1];
1763             sum3  = b[row+2];
1764             for(n = 0; n<sz-1; n+=2) {
1765               i1   = idx[0];
1766               i2   = idx[1];
1767               idx += 2;
1768               tmp0 = x[i1];
1769               tmp1 = x[i2];
1770               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1771               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1772               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1773             }
1774 
1775             if (n == sz-1){
1776               tmp0  = x[*idx];
1777               sum1 -= v1[0] * tmp0;
1778               sum2 -= v2[0] * tmp0;
1779               sum3 -= v3[0] * tmp0;
1780             }
1781             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1782             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1783             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1784             ibdiag  += 9;
1785             break;
1786           case 4:
1787             v2    = a->a + ii[row+1];
1788             v3    = a->a + ii[row+2];
1789             v4    = a->a + ii[row+3];
1790             sum1  = b[row];
1791             sum2  = b[row+1];
1792             sum3  = b[row+2];
1793             sum4  = b[row+3];
1794             for(n = 0; n<sz-1; n+=2) {
1795               i1   = idx[0];
1796               i2   = idx[1];
1797               idx += 2;
1798               tmp0 = x[i1];
1799               tmp1 = x[i2];
1800               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1801               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1802               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1803               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1804             }
1805 
1806             if (n == sz-1){
1807               tmp0  = x[*idx];
1808               sum1 -= v1[0] * tmp0;
1809               sum2 -= v2[0] * tmp0;
1810               sum3 -= v3[0] * tmp0;
1811               sum4 -= v4[0] * tmp0;
1812             }
1813             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1814             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1815             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1816             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1817             ibdiag  += 16;
1818             break;
1819           case 5:
1820             v2    = a->a + ii[row+1];
1821             v3    = a->a + ii[row+2];
1822             v4    = a->a + ii[row+3];
1823             v5    = a->a + ii[row+4];
1824             sum1  = b[row];
1825             sum2  = b[row+1];
1826             sum3  = b[row+2];
1827             sum4  = b[row+3];
1828             sum5  = b[row+4];
1829             for(n = 0; n<sz-1; n+=2) {
1830               i1   = idx[0];
1831               i2   = idx[1];
1832               idx += 2;
1833               tmp0 = x[i1];
1834               tmp1 = x[i2];
1835               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1836               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1837               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1838               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1839               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1840             }
1841 
1842             if (n == sz-1){
1843               tmp0  = x[*idx];
1844               sum1 -= v1[0] * tmp0;
1845               sum2 -= v2[0] * tmp0;
1846               sum3 -= v3[0] * tmp0;
1847               sum4 -= v4[0] * tmp0;
1848               sum5 -= v5[0] * tmp0;
1849             }
1850             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1851             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1852             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1853             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1854             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1855             ibdiag  += 25;
1856             break;
1857 	  default:
1858    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1859         }
1860       }
1861 
1862       xb = x;
1863       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1864     } else xb = b;
1865     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1866         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1867       cnt = 0;
1868       for (i=0, row=0; i<m; i++) {
1869 
1870         switch (sizes[i]){
1871           case 1:
1872             x[row++] *= bdiag[cnt++];
1873             break;
1874           case 2:
1875             x1   = x[row]; x2 = x[row+1];
1876             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1877             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1878             x[row++] = tmp1;
1879             x[row++] = tmp2;
1880             cnt += 4;
1881             break;
1882           case 3:
1883             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1884             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1885             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1886             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1887             x[row++] = tmp1;
1888             x[row++] = tmp2;
1889             x[row++] = tmp3;
1890             cnt += 9;
1891             break;
1892           case 4:
1893             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1894             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1895             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1896             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1897             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1898             x[row++] = tmp1;
1899             x[row++] = tmp2;
1900             x[row++] = tmp3;
1901             x[row++] = tmp4;
1902             cnt += 16;
1903             break;
1904           case 5:
1905             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1906             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1907             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1908             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1909             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1910             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1911             x[row++] = tmp1;
1912             x[row++] = tmp2;
1913             x[row++] = tmp3;
1914             x[row++] = tmp4;
1915             x[row++] = tmp5;
1916             cnt += 25;
1917             break;
1918 	  default:
1919    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1920         }
1921       }
1922       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1923     }
1924     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1925 
1926       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1927       for (i=m-1, row=A->rmap.n-1; i>=0; i--) {
1928         ibdiag -= sizes[i]*sizes[i];
1929         sz      = ii[row+1] - diag[row] - 1;
1930         v1      = a->a + diag[row] + 1;
1931         idx     = a->j + diag[row] + 1;
1932 
1933         /* see comments for MatMult_Inode() for how this is coded */
1934         switch (sizes[i]){
1935           case 1:
1936 
1937             sum1  = xb[row];
1938             for(n = 0; n<sz-1; n+=2) {
1939               i1   = idx[0];
1940               i2   = idx[1];
1941               idx += 2;
1942               tmp0 = x[i1];
1943               tmp1 = x[i2];
1944               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1945             }
1946 
1947             if (n == sz-1){
1948               tmp0  = x[*idx];
1949               sum1 -= *v1*tmp0;
1950             }
1951             x[row--] = sum1*(*ibdiag);
1952             break;
1953 
1954           case 2:
1955 
1956             sum1  = xb[row];
1957             sum2  = xb[row-1];
1958             /* note that sum1 is associated with the second of the two rows */
1959             v2    = a->a + diag[row-1] + 2;
1960             for(n = 0; n<sz-1; n+=2) {
1961               i1   = idx[0];
1962               i2   = idx[1];
1963               idx += 2;
1964               tmp0 = x[i1];
1965               tmp1 = x[i2];
1966               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1967               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1968             }
1969 
1970             if (n == sz-1){
1971               tmp0  = x[*idx];
1972               sum1 -= *v1*tmp0;
1973               sum2 -= *v2*tmp0;
1974             }
1975             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1976             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1977             break;
1978           case 3:
1979 
1980             sum1  = xb[row];
1981             sum2  = xb[row-1];
1982             sum3  = xb[row-2];
1983             v2    = a->a + diag[row-1] + 2;
1984             v3    = a->a + diag[row-2] + 3;
1985             for(n = 0; n<sz-1; n+=2) {
1986               i1   = idx[0];
1987               i2   = idx[1];
1988               idx += 2;
1989               tmp0 = x[i1];
1990               tmp1 = x[i2];
1991               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1992               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1993               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1994             }
1995 
1996             if (n == sz-1){
1997               tmp0  = x[*idx];
1998               sum1 -= *v1*tmp0;
1999               sum2 -= *v2*tmp0;
2000               sum3 -= *v3*tmp0;
2001             }
2002             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2003             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2004             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2005             break;
2006           case 4:
2007 
2008             sum1  = xb[row];
2009             sum2  = xb[row-1];
2010             sum3  = xb[row-2];
2011             sum4  = xb[row-3];
2012             v2    = a->a + diag[row-1] + 2;
2013             v3    = a->a + diag[row-2] + 3;
2014             v4    = a->a + diag[row-3] + 4;
2015             for(n = 0; n<sz-1; n+=2) {
2016               i1   = idx[0];
2017               i2   = idx[1];
2018               idx += 2;
2019               tmp0 = x[i1];
2020               tmp1 = x[i2];
2021               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2022               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2023               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2024               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2025             }
2026 
2027             if (n == sz-1){
2028               tmp0  = x[*idx];
2029               sum1 -= *v1*tmp0;
2030               sum2 -= *v2*tmp0;
2031               sum3 -= *v3*tmp0;
2032               sum4 -= *v4*tmp0;
2033             }
2034             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2035             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2036             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2037             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2038             break;
2039           case 5:
2040 
2041             sum1  = xb[row];
2042             sum2  = xb[row-1];
2043             sum3  = xb[row-2];
2044             sum4  = xb[row-3];
2045             sum5  = xb[row-4];
2046             v2    = a->a + diag[row-1] + 2;
2047             v3    = a->a + diag[row-2] + 3;
2048             v4    = a->a + diag[row-3] + 4;
2049             v5    = a->a + diag[row-4] + 5;
2050             for(n = 0; n<sz-1; n+=2) {
2051               i1   = idx[0];
2052               i2   = idx[1];
2053               idx += 2;
2054               tmp0 = x[i1];
2055               tmp1 = x[i2];
2056               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2057               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2058               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2059               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2060               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2061             }
2062 
2063             if (n == sz-1){
2064               tmp0  = x[*idx];
2065               sum1 -= *v1*tmp0;
2066               sum2 -= *v2*tmp0;
2067               sum3 -= *v3*tmp0;
2068               sum4 -= *v4*tmp0;
2069               sum5 -= *v5*tmp0;
2070             }
2071             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2072             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2073             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2074             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2075             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2076             break;
2077 	  default:
2078    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2079         }
2080       }
2081 
2082       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2083     }
2084     its--;
2085   }
2086   if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode");
2087   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2088   if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2089   PetscFunctionReturn(0);
2090 }
2091 
2092 
2093 /*
2094     samestructure indicates that the matrix has not changed its nonzero structure so we
2095     do not need to recompute the inodes
2096 */
2097 #undef __FUNCT__
2098 #define __FUNCT__ "Mat_CheckInode"
2099 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2100 {
2101   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2102   PetscErrorCode ierr;
2103   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2104   PetscTruth     flag;
2105 
2106   PetscFunctionBegin;
2107   if (!a->inode.use)                     PetscFunctionReturn(0);
2108   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2109 
2110 
2111   m = A->rmap.n;
2112   if (a->inode.size) {ns = a->inode.size;}
2113   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2114 
2115   i          = 0;
2116   node_count = 0;
2117   idx        = a->j;
2118   ii         = a->i;
2119   while (i < m){                /* For each row */
2120     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2121     /* Limits the number of elements in a node to 'a->inode.limit' */
2122     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2123       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2124       if (nzy != nzx) break;
2125       idy  += nzx;             /* Same nonzero pattern */
2126       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2127       if (!flag) break;
2128     }
2129     ns[node_count++] = blk_size;
2130     idx += blk_size*nzx;
2131     i    = j;
2132   }
2133   /* If not enough inodes found,, do not use inode version of the routines */
2134   if (!a->inode.size && m && node_count > .9*m) {
2135     ierr = PetscFree(ns);CHKERRQ(ierr);
2136     a->inode.node_count     = 0;
2137     a->inode.size           = PETSC_NULL;
2138     a->inode.use            = PETSC_FALSE;
2139     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2140   } else {
2141     A->ops->mult            = MatMult_Inode;
2142     A->ops->relax           = MatRelax_Inode;
2143     A->ops->multadd         = MatMultAdd_Inode;
2144     A->ops->solve           = MatSolve_Inode;
2145     A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
2146     A->ops->getrowij        = MatGetRowIJ_Inode;
2147     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
2148     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
2149     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
2150     A->ops->coloringpatch   = MatColoringPatch_Inode;
2151     a->inode.node_count     = node_count;
2152     a->inode.size           = ns;
2153     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2154   }
2155   PetscFunctionReturn(0);
2156 }
2157 
2158 /*
2159      This is really ugly. if inodes are used this replaces the
2160   permutations with ones that correspond to rows/cols of the matrix
2161   rather then inode blocks
2162 */
2163 #undef __FUNCT__
2164 #define __FUNCT__ "MatInodeAdjustForInodes"
2165 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2166 {
2167   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2168 
2169   PetscFunctionBegin;
2170   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2171   if (f) {
2172     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2173   }
2174   PetscFunctionReturn(0);
2175 }
2176 
2177 EXTERN_C_BEGIN
2178 #undef __FUNCT__
2179 #define __FUNCT__ "MatAdjustForInodes_Inode"
2180 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2181 {
2182   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2183   PetscErrorCode ierr;
2184   PetscInt       m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
2185   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2186   PetscInt       nslim_col,*ns_col;
2187   IS             ris = *rperm,cis = *cperm;
2188 
2189   PetscFunctionBegin;
2190   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2191   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2192 
2193   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2194   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2195   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
2196   permc = permr + m;
2197 
2198   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2199   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2200 
2201   /* Form the inode structure for the rows of permuted matric using inv perm*/
2202   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2203 
2204   /* Construct the permutations for rows*/
2205   for (i=0,row = 0; i<nslim_row; ++i){
2206     indx      = ridx[i];
2207     start_val = tns[indx];
2208     end_val   = tns[indx + 1];
2209     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2210   }
2211 
2212   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2213   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2214 
2215  /* Construct permutations for columns */
2216   for (i=0,col=0; i<nslim_col; ++i){
2217     indx      = cidx[i];
2218     start_val = tns[indx];
2219     end_val   = tns[indx + 1];
2220     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2221   }
2222 
2223   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2224   ISSetPermutation(*rperm);
2225   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2226   ISSetPermutation(*cperm);
2227 
2228   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2229   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2230 
2231   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2232   ierr = PetscFree(permr);CHKERRQ(ierr);
2233   ierr = ISDestroy(cis);CHKERRQ(ierr);
2234   ierr = ISDestroy(ris);CHKERRQ(ierr);
2235   ierr = PetscFree(tns);CHKERRQ(ierr);
2236   PetscFunctionReturn(0);
2237 }
2238 EXTERN_C_END
2239 
2240 #undef __FUNCT__
2241 #define __FUNCT__ "MatInodeGetInodeSizes"
2242 /*@C
2243    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2244 
2245    Collective on Mat
2246 
2247    Input Parameter:
2248 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2249 
2250    Output Parameter:
2251 +  node_count - no of inodes present in the matrix.
2252 .  sizes      - an array of size node_count,with sizes of each inode.
2253 -  limit      - the max size used to generate the inodes.
2254 
2255    Level: advanced
2256 
2257    Notes: This routine returns some internal storage information
2258    of the matrix, it is intended to be used by advanced users.
2259    It should be called after the matrix is assembled.
2260    The contents of the sizes[] array should not be changed.
2261    PETSC_NULL may be passed for information not requested.
2262 
2263 .keywords: matrix, seqaij, get, inode
2264 
2265 .seealso: MatGetInfo()
2266 @*/
2267 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2268 {
2269   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2270 
2271   PetscFunctionBegin;
2272   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2273   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2274   if (f) {
2275     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2276   }
2277   PetscFunctionReturn(0);
2278 }
2279 
2280 EXTERN_C_BEGIN
2281 #undef __FUNCT__
2282 #define __FUNCT__ "MatInodeGetInodeSizes_Inode"
2283 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2284 {
2285   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2286 
2287   PetscFunctionBegin;
2288   if (node_count) *node_count = a->inode.node_count;
2289   if (sizes)      *sizes      = a->inode.size;
2290   if (limit)      *limit      = a->inode.limit;
2291   PetscFunctionReturn(0);
2292 }
2293 EXTERN_C_END
2294