xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 1cbcd9c41d26426b9e0594c714fd92552a02df39)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap.n;
19   m      = A->rmap.n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap.n;
72   n         = A->cmap.n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap.n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap.n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_Inode"
397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x,*v1,*v2,*v3,*v4,*v5;
403   PetscErrorCode    ierr;
404   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
405 
406 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
407 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
408 #endif
409 
410   PetscFunctionBegin;
411   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
412   node_max = a->inode.node_count;
413   ns       = a->inode.size;     /* Node Size array */
414   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
415   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
416   idx  = a->j;
417   v1   = a->a;
418   ii   = a->i;
419 
420   for (i = 0,row = 0; i< node_max; ++i){
421     nsz  = ns[i];
422     n    = ii[1] - ii[0];
423     nonzerorow += (n>0)*nsz;
424     ii  += nsz;
425     sz   = n;                   /* No of non zeros in this row */
426                                 /* Switch on the size of Node */
427     switch (nsz){               /* Each loop in 'case' is unrolled */
428     case 1 :
429       sum1  = 0;
430 
431       for(n = 0; n< sz-1; n+=2) {
432         i1   = idx[0];          /* The instructions are ordered to */
433         i2   = idx[1];          /* make the compiler's job easy */
434         idx += 2;
435         tmp0 = x[i1];
436         tmp1 = x[i2];
437         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
438        }
439 
440       if (n == sz-1){          /* Take care of the last nonzero  */
441         tmp0  = x[*idx++];
442         sum1 += *v1++ * tmp0;
443       }
444       y[row++]=sum1;
445       break;
446     case 2:
447       sum1  = 0;
448       sum2  = 0;
449       v2    = v1 + n;
450 
451       for (n = 0; n< sz-1; n+=2) {
452         i1   = idx[0];
453         i2   = idx[1];
454         idx += 2;
455         tmp0 = x[i1];
456         tmp1 = x[i2];
457         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
458         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
459       }
460       if (n == sz-1){
461         tmp0  = x[*idx++];
462         sum1 += *v1++ * tmp0;
463         sum2 += *v2++ * tmp0;
464       }
465       y[row++]=sum1;
466       y[row++]=sum2;
467       v1      =v2;              /* Since the next block to be processed starts there*/
468       idx    +=sz;
469       break;
470     case 3:
471       sum1  = 0;
472       sum2  = 0;
473       sum3  = 0;
474       v2    = v1 + n;
475       v3    = v2 + n;
476 
477       for (n = 0; n< sz-1; n+=2) {
478         i1   = idx[0];
479         i2   = idx[1];
480         idx += 2;
481         tmp0 = x[i1];
482         tmp1 = x[i2];
483         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
484         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
485         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
486       }
487       if (n == sz-1){
488         tmp0  = x[*idx++];
489         sum1 += *v1++ * tmp0;
490         sum2 += *v2++ * tmp0;
491         sum3 += *v3++ * tmp0;
492       }
493       y[row++]=sum1;
494       y[row++]=sum2;
495       y[row++]=sum3;
496       v1       =v3;             /* Since the next block to be processed starts there*/
497       idx     +=2*sz;
498       break;
499     case 4:
500       sum1  = 0;
501       sum2  = 0;
502       sum3  = 0;
503       sum4  = 0;
504       v2    = v1 + n;
505       v3    = v2 + n;
506       v4    = v3 + n;
507 
508       for (n = 0; n< sz-1; n+=2) {
509         i1   = idx[0];
510         i2   = idx[1];
511         idx += 2;
512         tmp0 = x[i1];
513         tmp1 = x[i2];
514         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
515         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
516         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
517         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
518       }
519       if (n == sz-1){
520         tmp0  = x[*idx++];
521         sum1 += *v1++ * tmp0;
522         sum2 += *v2++ * tmp0;
523         sum3 += *v3++ * tmp0;
524         sum4 += *v4++ * tmp0;
525       }
526       y[row++]=sum1;
527       y[row++]=sum2;
528       y[row++]=sum3;
529       y[row++]=sum4;
530       v1      =v4;              /* Since the next block to be processed starts there*/
531       idx    +=3*sz;
532       break;
533     case 5:
534       sum1  = 0;
535       sum2  = 0;
536       sum3  = 0;
537       sum4  = 0;
538       sum5  = 0;
539       v2    = v1 + n;
540       v3    = v2 + n;
541       v4    = v3 + n;
542       v5    = v4 + n;
543 
544       for (n = 0; n<sz-1; n+=2) {
545         i1   = idx[0];
546         i2   = idx[1];
547         idx += 2;
548         tmp0 = x[i1];
549         tmp1 = x[i2];
550         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
551         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
552         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
553         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
554         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
555       }
556       if (n == sz-1){
557         tmp0  = x[*idx++];
558         sum1 += *v1++ * tmp0;
559         sum2 += *v2++ * tmp0;
560         sum3 += *v3++ * tmp0;
561         sum4 += *v4++ * tmp0;
562         sum5 += *v5++ * tmp0;
563       }
564       y[row++]=sum1;
565       y[row++]=sum2;
566       y[row++]=sum3;
567       y[row++]=sum4;
568       y[row++]=sum5;
569       v1      =v5;       /* Since the next block to be processed starts there */
570       idx    +=4*sz;
571       break;
572     default :
573       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
574     }
575   }
576   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
577   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
578   ierr = PetscLogFlops(2*a->nz - nonzerorow);CHKERRQ(ierr);
579   PetscFunctionReturn(0);
580 }
581 /* ----------------------------------------------------------- */
582 /* Almost same code as the MatMult_Inode() */
583 #undef __FUNCT__
584 #define __FUNCT__ "MatMultAdd_Inode"
585 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
586 {
587   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
588   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
589   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
590   PetscErrorCode ierr;
591   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
592 
593   PetscFunctionBegin;
594   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
595   node_max = a->inode.node_count;
596   ns       = a->inode.size;     /* Node Size array */
597   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
598   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
599   if (zz != yy) {
600     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
601   } else {
602     z = y;
603   }
604   zt = z;
605 
606   idx  = a->j;
607   v1   = a->a;
608   ii   = a->i;
609 
610   for (i = 0,row = 0; i< node_max; ++i){
611     nsz  = ns[i];
612     n    = ii[1] - ii[0];
613     ii  += nsz;
614     sz   = n;                   /* No of non zeros in this row */
615                                 /* Switch on the size of Node */
616     switch (nsz){               /* Each loop in 'case' is unrolled */
617     case 1 :
618       sum1  = *zt++;
619 
620       for(n = 0; n< sz-1; n+=2) {
621         i1   = idx[0];          /* The instructions are ordered to */
622         i2   = idx[1];          /* make the compiler's job easy */
623         idx += 2;
624         tmp0 = x[i1];
625         tmp1 = x[i2];
626         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
627        }
628 
629       if(n   == sz-1){          /* Take care of the last nonzero  */
630         tmp0  = x[*idx++];
631         sum1 += *v1++ * tmp0;
632       }
633       y[row++]=sum1;
634       break;
635     case 2:
636       sum1  = *zt++;
637       sum2  = *zt++;
638       v2    = v1 + n;
639 
640       for(n = 0; n< sz-1; n+=2) {
641         i1   = idx[0];
642         i2   = idx[1];
643         idx += 2;
644         tmp0 = x[i1];
645         tmp1 = x[i2];
646         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
647         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
648       }
649       if(n   == sz-1){
650         tmp0  = x[*idx++];
651         sum1 += *v1++ * tmp0;
652         sum2 += *v2++ * tmp0;
653       }
654       y[row++]=sum1;
655       y[row++]=sum2;
656       v1      =v2;              /* Since the next block to be processed starts there*/
657       idx    +=sz;
658       break;
659     case 3:
660       sum1  = *zt++;
661       sum2  = *zt++;
662       sum3  = *zt++;
663       v2    = v1 + n;
664       v3    = v2 + n;
665 
666       for (n = 0; n< sz-1; n+=2) {
667         i1   = idx[0];
668         i2   = idx[1];
669         idx += 2;
670         tmp0 = x[i1];
671         tmp1 = x[i2];
672         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
673         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
674         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
675       }
676       if (n == sz-1){
677         tmp0  = x[*idx++];
678         sum1 += *v1++ * tmp0;
679         sum2 += *v2++ * tmp0;
680         sum3 += *v3++ * tmp0;
681       }
682       y[row++]=sum1;
683       y[row++]=sum2;
684       y[row++]=sum3;
685       v1       =v3;             /* Since the next block to be processed starts there*/
686       idx     +=2*sz;
687       break;
688     case 4:
689       sum1  = *zt++;
690       sum2  = *zt++;
691       sum3  = *zt++;
692       sum4  = *zt++;
693       v2    = v1 + n;
694       v3    = v2 + n;
695       v4    = v3 + n;
696 
697       for (n = 0; n< sz-1; n+=2) {
698         i1   = idx[0];
699         i2   = idx[1];
700         idx += 2;
701         tmp0 = x[i1];
702         tmp1 = x[i2];
703         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
704         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
705         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
706         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
707       }
708       if (n == sz-1){
709         tmp0  = x[*idx++];
710         sum1 += *v1++ * tmp0;
711         sum2 += *v2++ * tmp0;
712         sum3 += *v3++ * tmp0;
713         sum4 += *v4++ * tmp0;
714       }
715       y[row++]=sum1;
716       y[row++]=sum2;
717       y[row++]=sum3;
718       y[row++]=sum4;
719       v1      =v4;              /* Since the next block to be processed starts there*/
720       idx    +=3*sz;
721       break;
722     case 5:
723       sum1  = *zt++;
724       sum2  = *zt++;
725       sum3  = *zt++;
726       sum4  = *zt++;
727       sum5  = *zt++;
728       v2    = v1 + n;
729       v3    = v2 + n;
730       v4    = v3 + n;
731       v5    = v4 + n;
732 
733       for (n = 0; n<sz-1; n+=2) {
734         i1   = idx[0];
735         i2   = idx[1];
736         idx += 2;
737         tmp0 = x[i1];
738         tmp1 = x[i2];
739         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
740         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
741         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
742         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
743         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
744       }
745       if(n   == sz-1){
746         tmp0  = x[*idx++];
747         sum1 += *v1++ * tmp0;
748         sum2 += *v2++ * tmp0;
749         sum3 += *v3++ * tmp0;
750         sum4 += *v4++ * tmp0;
751         sum5 += *v5++ * tmp0;
752       }
753       y[row++]=sum1;
754       y[row++]=sum2;
755       y[row++]=sum3;
756       y[row++]=sum4;
757       y[row++]=sum5;
758       v1      =v5;       /* Since the next block to be processed starts there */
759       idx    +=4*sz;
760       break;
761     default :
762       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
763     }
764   }
765   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
766   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
767   if (zz != yy) {
768     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
769   }
770   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
771   PetscFunctionReturn(0);
772 }
773 
774 /* ----------------------------------------------------------- */
775 #undef __FUNCT__
776 #define __FUNCT__ "MatSolve_Inode"
777 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
778 {
779   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
780   IS                iscol = a->col,isrow = a->row;
781   PetscErrorCode    ierr;
782   PetscInt          *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
783   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
784   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
785   PetscScalar       sum1,sum2,sum3,sum4,sum5;
786   const PetscScalar *v1,*v2,*v3,*v4,*v5,*b,*a_a = a->a,*aa;
787 
788   PetscFunctionBegin;
789   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
790   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
791   node_max = a->inode.node_count;
792   ns       = a->inode.size;     /* Node Size array */
793 
794   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
795   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
796   tmp  = a->solve_work;
797 
798   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
799   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
800 
801   /* forward solve the lower triangular */
802   tmps = tmp ;
803   aa   = a_a ;
804   aj   = a_j ;
805   ad   = a->diag;
806 
807   for (i = 0,row = 0; i< node_max; ++i){
808     nsz = ns[i];
809     aii = ai[row];
810     v1  = aa + aii;
811     vi  = aj + aii;
812     nz  = ad[row]- aii;
813 
814     switch (nsz){               /* Each loop in 'case' is unrolled */
815     case 1 :
816       sum1 = b[*r++];
817       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
818       for(j=0; j<nz-1; j+=2){
819         i0   = vi[0];
820         i1   = vi[1];
821         vi  +=2;
822         tmp0 = tmps[i0];
823         tmp1 = tmps[i1];
824         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
825       }
826       if(j == nz-1){
827         tmp0 = tmps[*vi++];
828         sum1 -= *v1++ *tmp0;
829       }
830       tmp[row ++]=sum1;
831       break;
832     case 2:
833       sum1 = b[*r++];
834       sum2 = b[*r++];
835       v2   = aa + ai[row+1];
836 
837       for(j=0; j<nz-1; j+=2){
838         i0   = vi[0];
839         i1   = vi[1];
840         vi  +=2;
841         tmp0 = tmps[i0];
842         tmp1 = tmps[i1];
843         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
844         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
845       }
846       if(j == nz-1){
847         tmp0 = tmps[*vi++];
848         sum1 -= *v1++ *tmp0;
849         sum2 -= *v2++ *tmp0;
850       }
851       sum2 -= *v2++ * sum1;
852       tmp[row ++]=sum1;
853       tmp[row ++]=sum2;
854       break;
855     case 3:
856       sum1 = b[*r++];
857       sum2 = b[*r++];
858       sum3 = b[*r++];
859       v2   = aa + ai[row+1];
860       v3   = aa + ai[row+2];
861 
862       for (j=0; j<nz-1; j+=2){
863         i0   = vi[0];
864         i1   = vi[1];
865         vi  +=2;
866         tmp0 = tmps[i0];
867         tmp1 = tmps[i1];
868         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
869         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
870         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
871       }
872       if (j == nz-1){
873         tmp0 = tmps[*vi++];
874         sum1 -= *v1++ *tmp0;
875         sum2 -= *v2++ *tmp0;
876         sum3 -= *v3++ *tmp0;
877       }
878       sum2 -= *v2++ * sum1;
879       sum3 -= *v3++ * sum1;
880       sum3 -= *v3++ * sum2;
881       tmp[row ++]=sum1;
882       tmp[row ++]=sum2;
883       tmp[row ++]=sum3;
884       break;
885 
886     case 4:
887       sum1 = b[*r++];
888       sum2 = b[*r++];
889       sum3 = b[*r++];
890       sum4 = b[*r++];
891       v2   = aa + ai[row+1];
892       v3   = aa + ai[row+2];
893       v4   = aa + ai[row+3];
894 
895       for (j=0; j<nz-1; j+=2){
896         i0   = vi[0];
897         i1   = vi[1];
898         vi  +=2;
899         tmp0 = tmps[i0];
900         tmp1 = tmps[i1];
901         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
902         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
903         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
904         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
905       }
906       if (j == nz-1){
907         tmp0 = tmps[*vi++];
908         sum1 -= *v1++ *tmp0;
909         sum2 -= *v2++ *tmp0;
910         sum3 -= *v3++ *tmp0;
911         sum4 -= *v4++ *tmp0;
912       }
913       sum2 -= *v2++ * sum1;
914       sum3 -= *v3++ * sum1;
915       sum4 -= *v4++ * sum1;
916       sum3 -= *v3++ * sum2;
917       sum4 -= *v4++ * sum2;
918       sum4 -= *v4++ * sum3;
919 
920       tmp[row ++]=sum1;
921       tmp[row ++]=sum2;
922       tmp[row ++]=sum3;
923       tmp[row ++]=sum4;
924       break;
925     case 5:
926       sum1 = b[*r++];
927       sum2 = b[*r++];
928       sum3 = b[*r++];
929       sum4 = b[*r++];
930       sum5 = b[*r++];
931       v2   = aa + ai[row+1];
932       v3   = aa + ai[row+2];
933       v4   = aa + ai[row+3];
934       v5   = aa + ai[row+4];
935 
936       for (j=0; j<nz-1; j+=2){
937         i0   = vi[0];
938         i1   = vi[1];
939         vi  +=2;
940         tmp0 = tmps[i0];
941         tmp1 = tmps[i1];
942         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
943         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
944         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
945         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
946         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
947       }
948       if (j == nz-1){
949         tmp0 = tmps[*vi++];
950         sum1 -= *v1++ *tmp0;
951         sum2 -= *v2++ *tmp0;
952         sum3 -= *v3++ *tmp0;
953         sum4 -= *v4++ *tmp0;
954         sum5 -= *v5++ *tmp0;
955       }
956 
957       sum2 -= *v2++ * sum1;
958       sum3 -= *v3++ * sum1;
959       sum4 -= *v4++ * sum1;
960       sum5 -= *v5++ * sum1;
961       sum3 -= *v3++ * sum2;
962       sum4 -= *v4++ * sum2;
963       sum5 -= *v5++ * sum2;
964       sum4 -= *v4++ * sum3;
965       sum5 -= *v5++ * sum3;
966       sum5 -= *v5++ * sum4;
967 
968       tmp[row ++]=sum1;
969       tmp[row ++]=sum2;
970       tmp[row ++]=sum3;
971       tmp[row ++]=sum4;
972       tmp[row ++]=sum5;
973       break;
974     default:
975       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
976     }
977   }
978   /* backward solve the upper triangular */
979   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
980     nsz = ns[i];
981     aii = ai[row+1] -1;
982     v1  = aa + aii;
983     vi  = aj + aii;
984     nz  = aii- ad[row];
985     switch (nsz){               /* Each loop in 'case' is unrolled */
986     case 1 :
987       sum1 = tmp[row];
988 
989       for(j=nz ; j>1; j-=2){
990         vi  -=2;
991         i0   = vi[2];
992         i1   = vi[1];
993         tmp0 = tmps[i0];
994         tmp1 = tmps[i1];
995         v1   -= 2;
996         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
997       }
998       if (j==1){
999         tmp0  = tmps[*vi--];
1000         sum1 -= *v1-- * tmp0;
1001       }
1002       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1003       break;
1004     case 2 :
1005       sum1 = tmp[row];
1006       sum2 = tmp[row -1];
1007       v2   = aa + ai[row]-1;
1008       for (j=nz ; j>1; j-=2){
1009         vi  -=2;
1010         i0   = vi[2];
1011         i1   = vi[1];
1012         tmp0 = tmps[i0];
1013         tmp1 = tmps[i1];
1014         v1   -= 2;
1015         v2   -= 2;
1016         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1017         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1018       }
1019       if (j==1){
1020         tmp0  = tmps[*vi--];
1021         sum1 -= *v1-- * tmp0;
1022         sum2 -= *v2-- * tmp0;
1023       }
1024 
1025       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1026       sum2   -= *v2-- * tmp0;
1027       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1028       break;
1029     case 3 :
1030       sum1 = tmp[row];
1031       sum2 = tmp[row -1];
1032       sum3 = tmp[row -2];
1033       v2   = aa + ai[row]-1;
1034       v3   = aa + ai[row -1]-1;
1035       for (j=nz ; j>1; j-=2){
1036         vi  -=2;
1037         i0   = vi[2];
1038         i1   = vi[1];
1039         tmp0 = tmps[i0];
1040         tmp1 = tmps[i1];
1041         v1   -= 2;
1042         v2   -= 2;
1043         v3   -= 2;
1044         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1045         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1046         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1047       }
1048       if (j==1){
1049         tmp0  = tmps[*vi--];
1050         sum1 -= *v1-- * tmp0;
1051         sum2 -= *v2-- * tmp0;
1052         sum3 -= *v3-- * tmp0;
1053       }
1054       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1055       sum2   -= *v2-- * tmp0;
1056       sum3   -= *v3-- * tmp0;
1057       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1058       sum3   -= *v3-- * tmp0;
1059       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1060 
1061       break;
1062     case 4 :
1063       sum1 = tmp[row];
1064       sum2 = tmp[row -1];
1065       sum3 = tmp[row -2];
1066       sum4 = tmp[row -3];
1067       v2   = aa + ai[row]-1;
1068       v3   = aa + ai[row -1]-1;
1069       v4   = aa + ai[row -2]-1;
1070 
1071       for (j=nz ; j>1; j-=2){
1072         vi  -=2;
1073         i0   = vi[2];
1074         i1   = vi[1];
1075         tmp0 = tmps[i0];
1076         tmp1 = tmps[i1];
1077         v1  -= 2;
1078         v2  -= 2;
1079         v3  -= 2;
1080         v4  -= 2;
1081         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1082         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1083         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1084         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1085       }
1086       if (j==1){
1087         tmp0  = tmps[*vi--];
1088         sum1 -= *v1-- * tmp0;
1089         sum2 -= *v2-- * tmp0;
1090         sum3 -= *v3-- * tmp0;
1091         sum4 -= *v4-- * tmp0;
1092       }
1093 
1094       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1095       sum2   -= *v2-- * tmp0;
1096       sum3   -= *v3-- * tmp0;
1097       sum4   -= *v4-- * tmp0;
1098       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1099       sum3   -= *v3-- * tmp0;
1100       sum4   -= *v4-- * tmp0;
1101       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1102       sum4   -= *v4-- * tmp0;
1103       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1104       break;
1105     case 5 :
1106       sum1 = tmp[row];
1107       sum2 = tmp[row -1];
1108       sum3 = tmp[row -2];
1109       sum4 = tmp[row -3];
1110       sum5 = tmp[row -4];
1111       v2   = aa + ai[row]-1;
1112       v3   = aa + ai[row -1]-1;
1113       v4   = aa + ai[row -2]-1;
1114       v5   = aa + ai[row -3]-1;
1115       for (j=nz ; j>1; j-=2){
1116         vi  -= 2;
1117         i0   = vi[2];
1118         i1   = vi[1];
1119         tmp0 = tmps[i0];
1120         tmp1 = tmps[i1];
1121         v1   -= 2;
1122         v2   -= 2;
1123         v3   -= 2;
1124         v4   -= 2;
1125         v5   -= 2;
1126         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1127         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1128         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1129         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1130         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1131       }
1132       if (j==1){
1133         tmp0  = tmps[*vi--];
1134         sum1 -= *v1-- * tmp0;
1135         sum2 -= *v2-- * tmp0;
1136         sum3 -= *v3-- * tmp0;
1137         sum4 -= *v4-- * tmp0;
1138         sum5 -= *v5-- * tmp0;
1139       }
1140 
1141       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1142       sum2   -= *v2-- * tmp0;
1143       sum3   -= *v3-- * tmp0;
1144       sum4   -= *v4-- * tmp0;
1145       sum5   -= *v5-- * tmp0;
1146       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1147       sum3   -= *v3-- * tmp0;
1148       sum4   -= *v4-- * tmp0;
1149       sum5   -= *v5-- * tmp0;
1150       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1151       sum4   -= *v4-- * tmp0;
1152       sum5   -= *v5-- * tmp0;
1153       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1154       sum5   -= *v5-- * tmp0;
1155       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1156       break;
1157     default:
1158       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1159     }
1160   }
1161   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1162   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1163   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1164   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1165   ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr);
1166   PetscFunctionReturn(0);
1167 }
1168 
1169 #undef __FUNCT__
1170 #define __FUNCT__ "MatLUFactorNumeric_Inode"
1171 PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1172 {
1173   Mat               C = *B;
1174   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1175   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1176   PetscErrorCode    ierr;
1177   PetscInt          *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1178   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1179   PetscInt          *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1180   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1181   PetscScalar       *pc1,*pc2,*pc3,mul1,mul2,mul3;
1182   PetscScalar       tmp,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1183   const PetscScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1184   PetscReal         rs=0.0;
1185   LUShift_Ctx       sctx;
1186   PetscInt          newshift;
1187 
1188   PetscFunctionBegin;
1189   sctx.shift_top  = 0;
1190   sctx.nshift_max = 0;
1191   sctx.shift_lo   = 0;
1192   sctx.shift_hi   = 0;
1193 
1194   /* if both shift schemes are chosen by user, only use info->shiftpd */
1195   if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1196   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1197     sctx.shift_top = 0;
1198     for (i=0; i<n; i++) {
1199       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1200       rs    = 0.0;
1201       ajtmp = aj + ai[i];
1202       rtmp1 = aa + ai[i];
1203       nz = ai[i+1] - ai[i];
1204       for (j=0; j<nz; j++){
1205         if (*ajtmp != i){
1206           rs += PetscAbsScalar(*rtmp1++);
1207         } else {
1208           rs -= PetscRealPart(*rtmp1++);
1209         }
1210         ajtmp++;
1211       }
1212       if (rs>sctx.shift_top) sctx.shift_top = rs;
1213     }
1214     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1215     sctx.shift_top *= 1.1;
1216     sctx.nshift_max = 5;
1217     sctx.shift_lo   = 0.;
1218     sctx.shift_hi   = 1.;
1219   }
1220   sctx.shift_amount = 0;
1221   sctx.nshift       = 0;
1222 
1223   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1224   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1225   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1226   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1227   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1228   ics   = ic ;
1229   rtmp22 = rtmp11 + n;
1230   rtmp33 = rtmp22 + n;
1231 
1232   node_max = a->inode.node_count;
1233   ns       = a->inode.size ;
1234   if (!ns){
1235     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1236   }
1237 
1238   /* If max inode size > 3, split it into two inodes.*/
1239   /* also map the inode sizes according to the ordering */
1240   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1241   for (i=0,j=0; i<node_max; ++i,++j){
1242     if (ns[i]>3) {
1243       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1244       ++j;
1245       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1246     } else {
1247       tmp_vec1[j] = ns[i];
1248     }
1249   }
1250   /* Use the correct node_max */
1251   node_max = j;
1252 
1253   /* Now reorder the inode info based on mat re-ordering info */
1254   /* First create a row -> inode_size_array_index map */
1255   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1256   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1257   for (i=0,row=0; i<node_max; i++) {
1258     nodesz = tmp_vec1[i];
1259     for (j=0; j<nodesz; j++,row++) {
1260       nsmap[row] = i;
1261     }
1262   }
1263   /* Using nsmap, create a reordered ns structure */
1264   for (i=0,j=0; i< node_max; i++) {
1265     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1266     tmp_vec2[i]  = nodesz;
1267     j           += nodesz;
1268   }
1269   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1270   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1271   /* Now use the correct ns */
1272   ns = tmp_vec2;
1273 
1274   do {
1275     sctx.lushift = PETSC_FALSE;
1276     /* Now loop over each block-row, and do the factorization */
1277     for (i=0,row=0; i<node_max; i++) {
1278       nodesz = ns[i];
1279       nz     = bi[row+1] - bi[row];
1280       bjtmp  = bj + bi[row];
1281 
1282       switch (nodesz){
1283       case 1:
1284         for  (j=0; j<nz; j++){
1285           idx        = bjtmp[j];
1286           rtmp11[idx] = 0.0;
1287         }
1288 
1289         /* load in initial (unfactored row) */
1290         idx    = r[row];
1291         nz_tmp = ai[idx+1] - ai[idx];
1292         ajtmp  = aj + ai[idx];
1293         v1     = aa + ai[idx];
1294 
1295         for (j=0; j<nz_tmp; j++) {
1296           idx        = ics[ajtmp[j]];
1297           rtmp11[idx] = v1[j];
1298         }
1299         rtmp11[ics[r[row]]] += sctx.shift_amount;
1300 
1301         prow = *bjtmp++ ;
1302         while (prow < row) {
1303           pc1 = rtmp11 + prow;
1304           if (*pc1 != 0.0){
1305             pv   = ba + bd[prow];
1306             pj   = nbj + bd[prow];
1307             mul1 = *pc1 * *pv++;
1308             *pc1 = mul1;
1309             nz_tmp = bi[prow+1] - bd[prow] - 1;
1310             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1311             for (j=0; j<nz_tmp; j++) {
1312               tmp = pv[j];
1313               idx = pj[j];
1314               rtmp11[idx] -= mul1 * tmp;
1315             }
1316           }
1317           prow = *bjtmp++ ;
1318         }
1319         pj  = bj + bi[row];
1320         pc1 = ba + bi[row];
1321 
1322         sctx.pv    = rtmp11[row];
1323         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1324         rs         = 0.0;
1325         for (j=0; j<nz; j++) {
1326           idx    = pj[j];
1327           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1328           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1329         }
1330         sctx.rs  = rs;
1331         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1332         if (newshift == 1) goto endofwhile;
1333         break;
1334 
1335       case 2:
1336         for (j=0; j<nz; j++) {
1337           idx        = bjtmp[j];
1338           rtmp11[idx] = 0.0;
1339           rtmp22[idx] = 0.0;
1340         }
1341 
1342         /* load in initial (unfactored row) */
1343         idx    = r[row];
1344         nz_tmp = ai[idx+1] - ai[idx];
1345         ajtmp  = aj + ai[idx];
1346         v1     = aa + ai[idx];
1347         v2     = aa + ai[idx+1];
1348         for (j=0; j<nz_tmp; j++) {
1349           idx        = ics[ajtmp[j]];
1350           rtmp11[idx] = v1[j];
1351           rtmp22[idx] = v2[j];
1352         }
1353         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1354         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1355 
1356         prow = *bjtmp++ ;
1357         while (prow < row) {
1358           pc1 = rtmp11 + prow;
1359           pc2 = rtmp22 + prow;
1360           if (*pc1 != 0.0 || *pc2 != 0.0){
1361             pv   = ba + bd[prow];
1362             pj   = nbj + bd[prow];
1363             mul1 = *pc1 * *pv;
1364             mul2 = *pc2 * *pv;
1365             ++pv;
1366             *pc1 = mul1;
1367             *pc2 = mul2;
1368 
1369             nz_tmp = bi[prow+1] - bd[prow] - 1;
1370             for (j=0; j<nz_tmp; j++) {
1371               tmp = pv[j];
1372               idx = pj[j];
1373               rtmp11[idx] -= mul1 * tmp;
1374               rtmp22[idx] -= mul2 * tmp;
1375             }
1376             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1377           }
1378           prow = *bjtmp++ ;
1379         }
1380 
1381         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1382         pc1 = rtmp11 + prow;
1383         pc2 = rtmp22 + prow;
1384 
1385         sctx.pv = *pc1;
1386         pj      = bj + bi[prow];
1387         rs      = 0.0;
1388         for (j=0; j<nz; j++){
1389           idx = pj[j];
1390           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1391         }
1392         sctx.rs = rs;
1393         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1394         if (newshift == 1) goto endofwhile;
1395 
1396         if (*pc2 != 0.0){
1397           pj     = nbj + bd[prow];
1398           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1399           *pc2   = mul2;
1400           nz_tmp = bi[prow+1] - bd[prow] - 1;
1401           for (j=0; j<nz_tmp; j++) {
1402             idx = pj[j] ;
1403             tmp = rtmp11[idx];
1404             rtmp22[idx] -= mul2 * tmp;
1405           }
1406           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1407         }
1408 
1409         pj  = bj + bi[row];
1410         pc1 = ba + bi[row];
1411         pc2 = ba + bi[row+1];
1412 
1413         sctx.pv = rtmp22[row+1];
1414         rs = 0.0;
1415         rtmp11[row]   = 1.0/rtmp11[row];
1416         rtmp22[row+1] = 1.0/rtmp22[row+1];
1417         /* copy row entries from dense representation to sparse */
1418         for (j=0; j<nz; j++) {
1419           idx    = pj[j];
1420           pc1[j] = rtmp11[idx];
1421           pc2[j] = rtmp22[idx];
1422           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1423         }
1424         sctx.rs = rs;
1425         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1426         if (newshift == 1) goto endofwhile;
1427         break;
1428 
1429       case 3:
1430         for  (j=0; j<nz; j++) {
1431           idx        = bjtmp[j];
1432           rtmp11[idx] = 0.0;
1433           rtmp22[idx] = 0.0;
1434           rtmp33[idx] = 0.0;
1435         }
1436         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1437         idx    = r[row];
1438         nz_tmp = ai[idx+1] - ai[idx];
1439         ajtmp = aj + ai[idx];
1440         v1    = aa + ai[idx];
1441         v2    = aa + ai[idx+1];
1442         v3    = aa + ai[idx+2];
1443         for (j=0; j<nz_tmp; j++) {
1444           idx        = ics[ajtmp[j]];
1445           rtmp11[idx] = v1[j];
1446           rtmp22[idx] = v2[j];
1447           rtmp33[idx] = v3[j];
1448         }
1449         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1450         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1451         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1452 
1453         /* loop over all pivot row blocks above this row block */
1454         prow = *bjtmp++ ;
1455         while (prow < row) {
1456           pc1 = rtmp11 + prow;
1457           pc2 = rtmp22 + prow;
1458           pc3 = rtmp33 + prow;
1459           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1460             pv   = ba  + bd[prow];
1461             pj   = nbj + bd[prow];
1462             mul1 = *pc1 * *pv;
1463             mul2 = *pc2 * *pv;
1464             mul3 = *pc3 * *pv;
1465             ++pv;
1466             *pc1 = mul1;
1467             *pc2 = mul2;
1468             *pc3 = mul3;
1469 
1470             nz_tmp = bi[prow+1] - bd[prow] - 1;
1471             /* update this row based on pivot row */
1472             for (j=0; j<nz_tmp; j++) {
1473               tmp = pv[j];
1474               idx = pj[j];
1475               rtmp11[idx] -= mul1 * tmp;
1476               rtmp22[idx] -= mul2 * tmp;
1477               rtmp33[idx] -= mul3 * tmp;
1478             }
1479             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1480           }
1481           prow = *bjtmp++ ;
1482         }
1483 
1484         /* Now take care of diagonal 3x3 block in this set of rows */
1485         /* note: prow = row here */
1486         pc1 = rtmp11 + prow;
1487         pc2 = rtmp22 + prow;
1488         pc3 = rtmp33 + prow;
1489 
1490         sctx.pv = *pc1;
1491         pj      = bj + bi[prow];
1492         rs      = 0.0;
1493         for (j=0; j<nz; j++){
1494           idx = pj[j];
1495           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
1496         }
1497         sctx.rs = rs;
1498         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1499         if (newshift == 1) goto endofwhile;
1500 
1501         if (*pc2 != 0.0 || *pc3 != 0.0){
1502           mul2 = (*pc2)/(*pc1);
1503           mul3 = (*pc3)/(*pc1);
1504           *pc2 = mul2;
1505           *pc3 = mul3;
1506           nz_tmp = bi[prow+1] - bd[prow] - 1;
1507           pj     = nbj + bd[prow];
1508           for (j=0; j<nz_tmp; j++) {
1509             idx = pj[j] ;
1510             tmp = rtmp11[idx];
1511             rtmp22[idx] -= mul2 * tmp;
1512             rtmp33[idx] -= mul3 * tmp;
1513           }
1514           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1515         }
1516         ++prow;
1517 
1518         pc2 = rtmp22 + prow;
1519         pc3 = rtmp33 + prow;
1520         sctx.pv = *pc2;
1521         pj      = bj + bi[prow];
1522         rs      = 0.0;
1523         for (j=0; j<nz; j++){
1524           idx = pj[j];
1525           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
1526         }
1527         sctx.rs = rs;
1528         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1529         if (newshift == 1) goto endofwhile;
1530 
1531         if (*pc3 != 0.0){
1532           mul3   = (*pc3)/(*pc2);
1533           *pc3   = mul3;
1534           pj     = nbj + bd[prow];
1535           nz_tmp = bi[prow+1] - bd[prow] - 1;
1536           for (j=0; j<nz_tmp; j++) {
1537             idx = pj[j] ;
1538             tmp = rtmp22[idx];
1539             rtmp33[idx] -= mul3 * tmp;
1540           }
1541           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1542         }
1543 
1544         pj  = bj + bi[row];
1545         pc1 = ba + bi[row];
1546         pc2 = ba + bi[row+1];
1547         pc3 = ba + bi[row+2];
1548 
1549         sctx.pv = rtmp33[row+2];
1550         rs = 0.0;
1551         rtmp11[row]   = 1.0/rtmp11[row];
1552         rtmp22[row+1] = 1.0/rtmp22[row+1];
1553         rtmp33[row+2] = 1.0/rtmp33[row+2];
1554         /* copy row entries from dense representation to sparse */
1555         for (j=0; j<nz; j++) {
1556           idx    = pj[j];
1557           pc1[j] = rtmp11[idx];
1558           pc2[j] = rtmp22[idx];
1559           pc3[j] = rtmp33[idx];
1560           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1561         }
1562 
1563         sctx.rs = rs;
1564         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
1565         if (newshift == 1) goto endofwhile;
1566         break;
1567 
1568       default:
1569         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1570       }
1571       row += nodesz;                 /* Update the row */
1572     }
1573     endofwhile:;
1574   } while (sctx.lushift);
1575   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
1576   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1577   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1578   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1579   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1580   C->factor      = FACTOR_LU;
1581   C->assembled   = PETSC_TRUE;
1582   if (sctx.nshift) {
1583     if (info->shiftnz) {
1584       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1585     } else if (info->shiftpd) {
1586       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1587     }
1588   }
1589   ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr);
1590   PetscFunctionReturn(0);
1591 }
1592 
1593 /*
1594      Makes a longer coloring[] array and calls the usual code with that
1595 */
1596 #undef __FUNCT__
1597 #define __FUNCT__ "MatColoringPatch_Inode"
1598 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1599 {
1600   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1601   PetscErrorCode  ierr;
1602   PetscInt        n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1603   PetscInt        *colorused,i;
1604   ISColoringValue *newcolor;
1605 
1606   PetscFunctionBegin;
1607   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1608   /* loop over inodes, marking a color for each column*/
1609   row = 0;
1610   for (i=0; i<m; i++){
1611     for (j=0; j<ns[i]; j++) {
1612       newcolor[row++] = coloring[i] + j*ncolors;
1613     }
1614   }
1615 
1616   /* eliminate unneeded colors */
1617   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1618   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1619   for (i=0; i<n; i++) {
1620     colorused[newcolor[i]] = 1;
1621   }
1622 
1623   for (i=1; i<5*ncolors; i++) {
1624     colorused[i] += colorused[i-1];
1625   }
1626   ncolors = colorused[5*ncolors-1];
1627   for (i=0; i<n; i++) {
1628     newcolor[i] = colorused[newcolor[i]]-1;
1629   }
1630   ierr = PetscFree(colorused);CHKERRQ(ierr);
1631   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1632   ierr = PetscFree(coloring);CHKERRQ(ierr);
1633   PetscFunctionReturn(0);
1634 }
1635 
1636 #include "src/inline/ilu.h"
1637 
1638 #undef __FUNCT__
1639 #define __FUNCT__ "MatRelax_Inode"
1640 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1641 {
1642   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1643   PetscScalar        *x,*xs,*ibdiag,*bdiag,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1644   PetscScalar        *b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1645   const PetscScalar  *v = a->a,*v1,*v2,*v3,*v4,*v5;
1646   PetscReal          zeropivot = 1.0e-15;
1647   PetscErrorCode     ierr;
1648   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1649   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k;
1650 
1651   PetscFunctionBegin;
1652   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1653   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1654   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode");
1655 
1656   if (!a->inode.ibdiagvalid) {
1657     if (!a->inode.ibdiag) {
1658       /* calculate space needed for diagonal blocks */
1659       for (i=0; i<m; i++) {
1660 	cnt += sizes[i]*sizes[i];
1661       }
1662       a->inode.bdiagsize = cnt;
1663       ierr   = PetscMalloc2(cnt,PetscScalar,&a->inode.ibdiag,cnt,PetscScalar,&a->inode.bdiag);CHKERRQ(ierr);
1664     }
1665 
1666     /* copy over the diagonal blocks and invert them */
1667     ibdiag = a->inode.ibdiag;
1668     bdiag  = a->inode.bdiag;
1669     cnt = 0;
1670     for (i=0, row = 0; i<m; i++) {
1671       for (j=0; j<sizes[i]; j++) {
1672         for (k=0; k<sizes[i]; k++) {
1673           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1674         }
1675       }
1676       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(PetscScalar));CHKERRQ(ierr);
1677 
1678       switch(sizes[i]) {
1679         case 1:
1680           /* Create matrix data structure */
1681           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1682           ibdiag[cnt] = 1.0/ibdiag[cnt];
1683           break;
1684         case 2:
1685           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt);CHKERRQ(ierr);
1686           break;
1687         case 3:
1688           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt);CHKERRQ(ierr);
1689           break;
1690         case 4:
1691           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt);CHKERRQ(ierr);
1692           break;
1693         case 5:
1694           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt);CHKERRQ(ierr);
1695           break;
1696        default:
1697 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1698       }
1699       cnt += sizes[i]*sizes[i];
1700       row += sizes[i];
1701     }
1702     a->inode.ibdiagvalid = PETSC_TRUE;
1703   }
1704   ibdiag = a->inode.ibdiag;
1705   bdiag  = a->inode.bdiag;
1706 
1707   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1708   if (xx != bb) {
1709     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1710   } else {
1711     b = x;
1712   }
1713 
1714   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1715   xs   = x;
1716   if (flag & SOR_ZERO_INITIAL_GUESS) {
1717     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1718 
1719       for (i=0, row=0; i<m; i++) {
1720         sz  = diag[row] - ii[row];
1721         v1  = a->a + ii[row];
1722         idx = a->j + ii[row];
1723 
1724         /* see comments for MatMult_Inode() for how this is coded */
1725         switch (sizes[i]){
1726           case 1:
1727 
1728             sum1  = b[row];
1729             for(n = 0; n<sz-1; n+=2) {
1730               i1   = idx[0];
1731               i2   = idx[1];
1732               idx += 2;
1733               tmp0 = x[i1];
1734               tmp1 = x[i2];
1735               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1736             }
1737 
1738             if (n == sz-1){
1739               tmp0  = x[*idx];
1740               sum1 -= *v1 * tmp0;
1741             }
1742             x[row++] = sum1*(*ibdiag++);
1743             break;
1744           case 2:
1745             v2    = a->a + ii[row+1];
1746             sum1  = b[row];
1747             sum2  = b[row+1];
1748             for(n = 0; n<sz-1; n+=2) {
1749               i1   = idx[0];
1750               i2   = idx[1];
1751               idx += 2;
1752               tmp0 = x[i1];
1753               tmp1 = x[i2];
1754               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1755               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1756             }
1757 
1758             if (n == sz-1){
1759               tmp0  = x[*idx];
1760               sum1 -= v1[0] * tmp0;
1761               sum2 -= v2[0] * tmp0;
1762             }
1763             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1764             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1765             ibdiag  += 4;
1766             break;
1767           case 3:
1768             v2    = a->a + ii[row+1];
1769             v3    = a->a + ii[row+2];
1770             sum1  = b[row];
1771             sum2  = b[row+1];
1772             sum3  = b[row+2];
1773             for(n = 0; n<sz-1; n+=2) {
1774               i1   = idx[0];
1775               i2   = idx[1];
1776               idx += 2;
1777               tmp0 = x[i1];
1778               tmp1 = x[i2];
1779               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1780               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1781               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1782             }
1783 
1784             if (n == sz-1){
1785               tmp0  = x[*idx];
1786               sum1 -= v1[0] * tmp0;
1787               sum2 -= v2[0] * tmp0;
1788               sum3 -= v3[0] * tmp0;
1789             }
1790             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1791             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1792             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1793             ibdiag  += 9;
1794             break;
1795           case 4:
1796             v2    = a->a + ii[row+1];
1797             v3    = a->a + ii[row+2];
1798             v4    = a->a + ii[row+3];
1799             sum1  = b[row];
1800             sum2  = b[row+1];
1801             sum3  = b[row+2];
1802             sum4  = b[row+3];
1803             for(n = 0; n<sz-1; n+=2) {
1804               i1   = idx[0];
1805               i2   = idx[1];
1806               idx += 2;
1807               tmp0 = x[i1];
1808               tmp1 = x[i2];
1809               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1810               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1811               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1812               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1813             }
1814 
1815             if (n == sz-1){
1816               tmp0  = x[*idx];
1817               sum1 -= v1[0] * tmp0;
1818               sum2 -= v2[0] * tmp0;
1819               sum3 -= v3[0] * tmp0;
1820               sum4 -= v4[0] * tmp0;
1821             }
1822             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1823             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1824             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1825             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1826             ibdiag  += 16;
1827             break;
1828           case 5:
1829             v2    = a->a + ii[row+1];
1830             v3    = a->a + ii[row+2];
1831             v4    = a->a + ii[row+3];
1832             v5    = a->a + ii[row+4];
1833             sum1  = b[row];
1834             sum2  = b[row+1];
1835             sum3  = b[row+2];
1836             sum4  = b[row+3];
1837             sum5  = b[row+4];
1838             for(n = 0; n<sz-1; n+=2) {
1839               i1   = idx[0];
1840               i2   = idx[1];
1841               idx += 2;
1842               tmp0 = x[i1];
1843               tmp1 = x[i2];
1844               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1845               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1846               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1847               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1848               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1849             }
1850 
1851             if (n == sz-1){
1852               tmp0  = x[*idx];
1853               sum1 -= v1[0] * tmp0;
1854               sum2 -= v2[0] * tmp0;
1855               sum3 -= v3[0] * tmp0;
1856               sum4 -= v4[0] * tmp0;
1857               sum5 -= v5[0] * tmp0;
1858             }
1859             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1860             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1861             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1862             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1863             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1864             ibdiag  += 25;
1865             break;
1866 	  default:
1867    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1868         }
1869       }
1870 
1871       xb = x;
1872       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1873     } else xb = b;
1874     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1875         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1876       cnt = 0;
1877       for (i=0, row=0; i<m; i++) {
1878 
1879         switch (sizes[i]){
1880           case 1:
1881             x[row++] *= bdiag[cnt++];
1882             break;
1883           case 2:
1884             x1   = x[row]; x2 = x[row+1];
1885             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1886             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1887             x[row++] = tmp1;
1888             x[row++] = tmp2;
1889             cnt += 4;
1890             break;
1891           case 3:
1892             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1893             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1894             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1895             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1896             x[row++] = tmp1;
1897             x[row++] = tmp2;
1898             x[row++] = tmp3;
1899             cnt += 9;
1900             break;
1901           case 4:
1902             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1903             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1904             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1905             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1906             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1907             x[row++] = tmp1;
1908             x[row++] = tmp2;
1909             x[row++] = tmp3;
1910             x[row++] = tmp4;
1911             cnt += 16;
1912             break;
1913           case 5:
1914             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1915             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1916             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1917             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1918             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1919             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1920             x[row++] = tmp1;
1921             x[row++] = tmp2;
1922             x[row++] = tmp3;
1923             x[row++] = tmp4;
1924             x[row++] = tmp5;
1925             cnt += 25;
1926             break;
1927 	  default:
1928    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1929         }
1930       }
1931       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1932     }
1933     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1934 
1935       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1936       for (i=m-1, row=A->rmap.n-1; i>=0; i--) {
1937         ibdiag -= sizes[i]*sizes[i];
1938         sz      = ii[row+1] - diag[row] - 1;
1939         v1      = a->a + diag[row] + 1;
1940         idx     = a->j + diag[row] + 1;
1941 
1942         /* see comments for MatMult_Inode() for how this is coded */
1943         switch (sizes[i]){
1944           case 1:
1945 
1946             sum1  = xb[row];
1947             for(n = 0; n<sz-1; n+=2) {
1948               i1   = idx[0];
1949               i2   = idx[1];
1950               idx += 2;
1951               tmp0 = x[i1];
1952               tmp1 = x[i2];
1953               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1954             }
1955 
1956             if (n == sz-1){
1957               tmp0  = x[*idx];
1958               sum1 -= *v1*tmp0;
1959             }
1960             x[row--] = sum1*(*ibdiag);
1961             break;
1962 
1963           case 2:
1964 
1965             sum1  = xb[row];
1966             sum2  = xb[row-1];
1967             /* note that sum1 is associated with the second of the two rows */
1968             v2    = a->a + diag[row-1] + 2;
1969             for(n = 0; n<sz-1; n+=2) {
1970               i1   = idx[0];
1971               i2   = idx[1];
1972               idx += 2;
1973               tmp0 = x[i1];
1974               tmp1 = x[i2];
1975               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1976               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1977             }
1978 
1979             if (n == sz-1){
1980               tmp0  = x[*idx];
1981               sum1 -= *v1*tmp0;
1982               sum2 -= *v2*tmp0;
1983             }
1984             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1985             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1986             break;
1987           case 3:
1988 
1989             sum1  = xb[row];
1990             sum2  = xb[row-1];
1991             sum3  = xb[row-2];
1992             v2    = a->a + diag[row-1] + 2;
1993             v3    = a->a + diag[row-2] + 3;
1994             for(n = 0; n<sz-1; n+=2) {
1995               i1   = idx[0];
1996               i2   = idx[1];
1997               idx += 2;
1998               tmp0 = x[i1];
1999               tmp1 = x[i2];
2000               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2001               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2002               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2003             }
2004 
2005             if (n == sz-1){
2006               tmp0  = x[*idx];
2007               sum1 -= *v1*tmp0;
2008               sum2 -= *v2*tmp0;
2009               sum3 -= *v3*tmp0;
2010             }
2011             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2012             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2013             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2014             break;
2015           case 4:
2016 
2017             sum1  = xb[row];
2018             sum2  = xb[row-1];
2019             sum3  = xb[row-2];
2020             sum4  = xb[row-3];
2021             v2    = a->a + diag[row-1] + 2;
2022             v3    = a->a + diag[row-2] + 3;
2023             v4    = a->a + diag[row-3] + 4;
2024             for(n = 0; n<sz-1; n+=2) {
2025               i1   = idx[0];
2026               i2   = idx[1];
2027               idx += 2;
2028               tmp0 = x[i1];
2029               tmp1 = x[i2];
2030               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2031               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2032               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2033               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2034             }
2035 
2036             if (n == sz-1){
2037               tmp0  = x[*idx];
2038               sum1 -= *v1*tmp0;
2039               sum2 -= *v2*tmp0;
2040               sum3 -= *v3*tmp0;
2041               sum4 -= *v4*tmp0;
2042             }
2043             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2044             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2045             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2046             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2047             break;
2048           case 5:
2049 
2050             sum1  = xb[row];
2051             sum2  = xb[row-1];
2052             sum3  = xb[row-2];
2053             sum4  = xb[row-3];
2054             sum5  = xb[row-4];
2055             v2    = a->a + diag[row-1] + 2;
2056             v3    = a->a + diag[row-2] + 3;
2057             v4    = a->a + diag[row-3] + 4;
2058             v5    = a->a + diag[row-4] + 5;
2059             for(n = 0; n<sz-1; n+=2) {
2060               i1   = idx[0];
2061               i2   = idx[1];
2062               idx += 2;
2063               tmp0 = x[i1];
2064               tmp1 = x[i2];
2065               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2066               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2067               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2068               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2069               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2070             }
2071 
2072             if (n == sz-1){
2073               tmp0  = x[*idx];
2074               sum1 -= *v1*tmp0;
2075               sum2 -= *v2*tmp0;
2076               sum3 -= *v3*tmp0;
2077               sum4 -= *v4*tmp0;
2078               sum5 -= *v5*tmp0;
2079             }
2080             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2081             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2082             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2083             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2084             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2085             break;
2086 	  default:
2087    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2088         }
2089       }
2090 
2091       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2092     }
2093     its--;
2094   }
2095   if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode");
2096   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2097   if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2098   PetscFunctionReturn(0);
2099 }
2100 
2101 
2102 /*
2103     samestructure indicates that the matrix has not changed its nonzero structure so we
2104     do not need to recompute the inodes
2105 */
2106 #undef __FUNCT__
2107 #define __FUNCT__ "Mat_CheckInode"
2108 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2109 {
2110   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2111   PetscErrorCode ierr;
2112   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2113   PetscTruth     flag;
2114 
2115   PetscFunctionBegin;
2116   if (!a->inode.use)                     PetscFunctionReturn(0);
2117   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2118 
2119 
2120   m = A->rmap.n;
2121   if (a->inode.size) {ns = a->inode.size;}
2122   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2123 
2124   i          = 0;
2125   node_count = 0;
2126   idx        = a->j;
2127   ii         = a->i;
2128   while (i < m){                /* For each row */
2129     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2130     /* Limits the number of elements in a node to 'a->inode.limit' */
2131     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2132       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2133       if (nzy != nzx) break;
2134       idy  += nzx;             /* Same nonzero pattern */
2135       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2136       if (!flag) break;
2137     }
2138     ns[node_count++] = blk_size;
2139     idx += blk_size*nzx;
2140     i    = j;
2141   }
2142   /* If not enough inodes found,, do not use inode version of the routines */
2143   if (!a->inode.size && m && node_count > .9*m) {
2144     ierr = PetscFree(ns);CHKERRQ(ierr);
2145     a->inode.node_count     = 0;
2146     a->inode.size           = PETSC_NULL;
2147     a->inode.use            = PETSC_FALSE;
2148     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2149   } else {
2150     A->ops->mult            = MatMult_Inode;
2151     A->ops->relax           = MatRelax_Inode;
2152     A->ops->multadd         = MatMultAdd_Inode;
2153     A->ops->solve           = MatSolve_Inode;
2154     A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
2155     A->ops->getrowij        = MatGetRowIJ_Inode;
2156     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
2157     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
2158     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
2159     A->ops->coloringpatch   = MatColoringPatch_Inode;
2160     a->inode.node_count     = node_count;
2161     a->inode.size           = ns;
2162     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2163   }
2164   PetscFunctionReturn(0);
2165 }
2166 
2167 /*
2168      This is really ugly. if inodes are used this replaces the
2169   permutations with ones that correspond to rows/cols of the matrix
2170   rather then inode blocks
2171 */
2172 #undef __FUNCT__
2173 #define __FUNCT__ "MatInodeAdjustForInodes"
2174 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2175 {
2176   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2177 
2178   PetscFunctionBegin;
2179   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2180   if (f) {
2181     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2182   }
2183   PetscFunctionReturn(0);
2184 }
2185 
2186 EXTERN_C_BEGIN
2187 #undef __FUNCT__
2188 #define __FUNCT__ "MatAdjustForInodes_Inode"
2189 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2190 {
2191   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2192   PetscErrorCode ierr;
2193   PetscInt       m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
2194   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2195   PetscInt       nslim_col,*ns_col;
2196   IS             ris = *rperm,cis = *cperm;
2197 
2198   PetscFunctionBegin;
2199   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2200   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2201 
2202   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2203   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2204   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
2205   permc = permr + m;
2206 
2207   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2208   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2209 
2210   /* Form the inode structure for the rows of permuted matric using inv perm*/
2211   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2212 
2213   /* Construct the permutations for rows*/
2214   for (i=0,row = 0; i<nslim_row; ++i){
2215     indx      = ridx[i];
2216     start_val = tns[indx];
2217     end_val   = tns[indx + 1];
2218     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2219   }
2220 
2221   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2222   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2223 
2224  /* Construct permutations for columns */
2225   for (i=0,col=0; i<nslim_col; ++i){
2226     indx      = cidx[i];
2227     start_val = tns[indx];
2228     end_val   = tns[indx + 1];
2229     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2230   }
2231 
2232   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2233   ISSetPermutation(*rperm);
2234   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2235   ISSetPermutation(*cperm);
2236 
2237   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2238   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2239 
2240   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2241   ierr = PetscFree(permr);CHKERRQ(ierr);
2242   ierr = ISDestroy(cis);CHKERRQ(ierr);
2243   ierr = ISDestroy(ris);CHKERRQ(ierr);
2244   ierr = PetscFree(tns);CHKERRQ(ierr);
2245   PetscFunctionReturn(0);
2246 }
2247 EXTERN_C_END
2248 
2249 #undef __FUNCT__
2250 #define __FUNCT__ "MatInodeGetInodeSizes"
2251 /*@C
2252    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2253 
2254    Collective on Mat
2255 
2256    Input Parameter:
2257 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2258 
2259    Output Parameter:
2260 +  node_count - no of inodes present in the matrix.
2261 .  sizes      - an array of size node_count,with sizes of each inode.
2262 -  limit      - the max size used to generate the inodes.
2263 
2264    Level: advanced
2265 
2266    Notes: This routine returns some internal storage information
2267    of the matrix, it is intended to be used by advanced users.
2268    It should be called after the matrix is assembled.
2269    The contents of the sizes[] array should not be changed.
2270    PETSC_NULL may be passed for information not requested.
2271 
2272 .keywords: matrix, seqaij, get, inode
2273 
2274 .seealso: MatGetInfo()
2275 @*/
2276 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2277 {
2278   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2279 
2280   PetscFunctionBegin;
2281   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2282   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2283   if (f) {
2284     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2285   }
2286   PetscFunctionReturn(0);
2287 }
2288 
2289 EXTERN_C_BEGIN
2290 #undef __FUNCT__
2291 #define __FUNCT__ "MatInodeGetInodeSizes_Inode"
2292 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2293 {
2294   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2295 
2296   PetscFunctionBegin;
2297   if (node_count) *node_count = a->inode.node_count;
2298   if (sizes)      *sizes      = a->inode.size;
2299   if (limit)      *limit      = a->inode.limit;
2300   PetscFunctionReturn(0);
2301 }
2302 EXTERN_C_END
2303