xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 3971808ee38d59bdec5e2f678e2f5332b9a41b1d)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap.n;
19   m      = A->rmap.n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap.n;
72   n         = A->cmap.n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap.n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap.n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_Inode"
397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y;
402   PetscErrorCode ierr;
403   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
404 
405 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
406 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
407 #endif
408 
409   PetscFunctionBegin;
410   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
411   node_max = a->inode.node_count;
412   ns       = a->inode.size;     /* Node Size array */
413   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
414   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
415   idx  = a->j;
416   v1   = a->a;
417   ii   = a->i;
418 
419   for (i = 0,row = 0; i< node_max; ++i){
420     nsz  = ns[i];
421     n    = ii[1] - ii[0];
422     ii  += nsz;
423     sz   = n;                   /* No of non zeros in this row */
424                                 /* Switch on the size of Node */
425     switch (nsz){               /* Each loop in 'case' is unrolled */
426     case 1 :
427       sum1  = 0;
428 
429       for(n = 0; n< sz-1; n+=2) {
430         i1   = idx[0];          /* The instructions are ordered to */
431         i2   = idx[1];          /* make the compiler's job easy */
432         idx += 2;
433         tmp0 = x[i1];
434         tmp1 = x[i2];
435         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
436        }
437 
438       if (n == sz-1){          /* Take care of the last nonzero  */
439         tmp0  = x[*idx++];
440         sum1 += *v1++ * tmp0;
441       }
442       y[row++]=sum1;
443       break;
444     case 2:
445       sum1  = 0;
446       sum2  = 0;
447       v2    = v1 + n;
448 
449       for (n = 0; n< sz-1; n+=2) {
450         i1   = idx[0];
451         i2   = idx[1];
452         idx += 2;
453         tmp0 = x[i1];
454         tmp1 = x[i2];
455         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
456         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
457       }
458       if (n == sz-1){
459         tmp0  = x[*idx++];
460         sum1 += *v1++ * tmp0;
461         sum2 += *v2++ * tmp0;
462       }
463       y[row++]=sum1;
464       y[row++]=sum2;
465       v1      =v2;              /* Since the next block to be processed starts there*/
466       idx    +=sz;
467       break;
468     case 3:
469       sum1  = 0;
470       sum2  = 0;
471       sum3  = 0;
472       v2    = v1 + n;
473       v3    = v2 + n;
474 
475       for (n = 0; n< sz-1; n+=2) {
476         i1   = idx[0];
477         i2   = idx[1];
478         idx += 2;
479         tmp0 = x[i1];
480         tmp1 = x[i2];
481         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
482         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
483         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
484       }
485       if (n == sz-1){
486         tmp0  = x[*idx++];
487         sum1 += *v1++ * tmp0;
488         sum2 += *v2++ * tmp0;
489         sum3 += *v3++ * tmp0;
490       }
491       y[row++]=sum1;
492       y[row++]=sum2;
493       y[row++]=sum3;
494       v1       =v3;             /* Since the next block to be processed starts there*/
495       idx     +=2*sz;
496       break;
497     case 4:
498       sum1  = 0;
499       sum2  = 0;
500       sum3  = 0;
501       sum4  = 0;
502       v2    = v1 + n;
503       v3    = v2 + n;
504       v4    = v3 + n;
505 
506       for (n = 0; n< sz-1; n+=2) {
507         i1   = idx[0];
508         i2   = idx[1];
509         idx += 2;
510         tmp0 = x[i1];
511         tmp1 = x[i2];
512         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
513         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
514         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
515         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
516       }
517       if (n == sz-1){
518         tmp0  = x[*idx++];
519         sum1 += *v1++ * tmp0;
520         sum2 += *v2++ * tmp0;
521         sum3 += *v3++ * tmp0;
522         sum4 += *v4++ * tmp0;
523       }
524       y[row++]=sum1;
525       y[row++]=sum2;
526       y[row++]=sum3;
527       y[row++]=sum4;
528       v1      =v4;              /* Since the next block to be processed starts there*/
529       idx    +=3*sz;
530       break;
531     case 5:
532       sum1  = 0;
533       sum2  = 0;
534       sum3  = 0;
535       sum4  = 0;
536       sum5  = 0;
537       v2    = v1 + n;
538       v3    = v2 + n;
539       v4    = v3 + n;
540       v5    = v4 + n;
541 
542       for (n = 0; n<sz-1; n+=2) {
543         i1   = idx[0];
544         i2   = idx[1];
545         idx += 2;
546         tmp0 = x[i1];
547         tmp1 = x[i2];
548         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
549         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
550         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
551         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
552         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
553       }
554       if (n == sz-1){
555         tmp0  = x[*idx++];
556         sum1 += *v1++ * tmp0;
557         sum2 += *v2++ * tmp0;
558         sum3 += *v3++ * tmp0;
559         sum4 += *v4++ * tmp0;
560         sum5 += *v5++ * tmp0;
561       }
562       y[row++]=sum1;
563       y[row++]=sum2;
564       y[row++]=sum3;
565       y[row++]=sum4;
566       y[row++]=sum5;
567       v1      =v5;       /* Since the next block to be processed starts there */
568       idx    +=4*sz;
569       break;
570     default :
571       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
572     }
573   }
574   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
575   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
576   ierr = PetscLogFlops(2*a->nz - A->rmap.n);CHKERRQ(ierr);
577   PetscFunctionReturn(0);
578 }
579 /* ----------------------------------------------------------- */
580 /* Almost same code as the MatMult_Inode() */
581 #undef __FUNCT__
582 #define __FUNCT__ "MatMultAdd_Inode"
583 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
584 {
585   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
586   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
587   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
588   PetscErrorCode ierr;
589   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
590 
591   PetscFunctionBegin;
592   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
593   node_max = a->inode.node_count;
594   ns       = a->inode.size;     /* Node Size array */
595   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
596   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
597   if (zz != yy) {
598     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
599   } else {
600     z = y;
601   }
602   zt = z;
603 
604   idx  = a->j;
605   v1   = a->a;
606   ii   = a->i;
607 
608   for (i = 0,row = 0; i< node_max; ++i){
609     nsz  = ns[i];
610     n    = ii[1] - ii[0];
611     ii  += nsz;
612     sz   = n;                   /* No of non zeros in this row */
613                                 /* Switch on the size of Node */
614     switch (nsz){               /* Each loop in 'case' is unrolled */
615     case 1 :
616       sum1  = *zt++;
617 
618       for(n = 0; n< sz-1; n+=2) {
619         i1   = idx[0];          /* The instructions are ordered to */
620         i2   = idx[1];          /* make the compiler's job easy */
621         idx += 2;
622         tmp0 = x[i1];
623         tmp1 = x[i2];
624         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
625        }
626 
627       if(n   == sz-1){          /* Take care of the last nonzero  */
628         tmp0  = x[*idx++];
629         sum1 += *v1++ * tmp0;
630       }
631       y[row++]=sum1;
632       break;
633     case 2:
634       sum1  = *zt++;
635       sum2  = *zt++;
636       v2    = v1 + n;
637 
638       for(n = 0; n< sz-1; n+=2) {
639         i1   = idx[0];
640         i2   = idx[1];
641         idx += 2;
642         tmp0 = x[i1];
643         tmp1 = x[i2];
644         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
645         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
646       }
647       if(n   == sz-1){
648         tmp0  = x[*idx++];
649         sum1 += *v1++ * tmp0;
650         sum2 += *v2++ * tmp0;
651       }
652       y[row++]=sum1;
653       y[row++]=sum2;
654       v1      =v2;              /* Since the next block to be processed starts there*/
655       idx    +=sz;
656       break;
657     case 3:
658       sum1  = *zt++;
659       sum2  = *zt++;
660       sum3  = *zt++;
661       v2    = v1 + n;
662       v3    = v2 + n;
663 
664       for (n = 0; n< sz-1; n+=2) {
665         i1   = idx[0];
666         i2   = idx[1];
667         idx += 2;
668         tmp0 = x[i1];
669         tmp1 = x[i2];
670         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
671         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
672         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
673       }
674       if (n == sz-1){
675         tmp0  = x[*idx++];
676         sum1 += *v1++ * tmp0;
677         sum2 += *v2++ * tmp0;
678         sum3 += *v3++ * tmp0;
679       }
680       y[row++]=sum1;
681       y[row++]=sum2;
682       y[row++]=sum3;
683       v1       =v3;             /* Since the next block to be processed starts there*/
684       idx     +=2*sz;
685       break;
686     case 4:
687       sum1  = *zt++;
688       sum2  = *zt++;
689       sum3  = *zt++;
690       sum4  = *zt++;
691       v2    = v1 + n;
692       v3    = v2 + n;
693       v4    = v3 + n;
694 
695       for (n = 0; n< sz-1; n+=2) {
696         i1   = idx[0];
697         i2   = idx[1];
698         idx += 2;
699         tmp0 = x[i1];
700         tmp1 = x[i2];
701         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
702         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
703         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
704         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
705       }
706       if (n == sz-1){
707         tmp0  = x[*idx++];
708         sum1 += *v1++ * tmp0;
709         sum2 += *v2++ * tmp0;
710         sum3 += *v3++ * tmp0;
711         sum4 += *v4++ * tmp0;
712       }
713       y[row++]=sum1;
714       y[row++]=sum2;
715       y[row++]=sum3;
716       y[row++]=sum4;
717       v1      =v4;              /* Since the next block to be processed starts there*/
718       idx    +=3*sz;
719       break;
720     case 5:
721       sum1  = *zt++;
722       sum2  = *zt++;
723       sum3  = *zt++;
724       sum4  = *zt++;
725       sum5  = *zt++;
726       v2    = v1 + n;
727       v3    = v2 + n;
728       v4    = v3 + n;
729       v5    = v4 + n;
730 
731       for (n = 0; n<sz-1; n+=2) {
732         i1   = idx[0];
733         i2   = idx[1];
734         idx += 2;
735         tmp0 = x[i1];
736         tmp1 = x[i2];
737         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
738         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
739         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
740         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
741         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
742       }
743       if(n   == sz-1){
744         tmp0  = x[*idx++];
745         sum1 += *v1++ * tmp0;
746         sum2 += *v2++ * tmp0;
747         sum3 += *v3++ * tmp0;
748         sum4 += *v4++ * tmp0;
749         sum5 += *v5++ * tmp0;
750       }
751       y[row++]=sum1;
752       y[row++]=sum2;
753       y[row++]=sum3;
754       y[row++]=sum4;
755       y[row++]=sum5;
756       v1      =v5;       /* Since the next block to be processed starts there */
757       idx    +=4*sz;
758       break;
759     default :
760       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
761     }
762   }
763   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
764   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
765   if (zz != yy) {
766     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
767   }
768   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
769   PetscFunctionReturn(0);
770 }
771 
772 /* ----------------------------------------------------------- */
773 #undef __FUNCT__
774 #define __FUNCT__ "MatSolve_Inode"
775 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
776 {
777   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
778   IS             iscol = a->col,isrow = a->row;
779   PetscErrorCode ierr;
780   PetscInt       *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
781   PetscInt       node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
782   PetscScalar    *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
783   PetscScalar    sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
784 
785   PetscFunctionBegin;
786   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
787   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
788   node_max = a->inode.node_count;
789   ns       = a->inode.size;     /* Node Size array */
790 
791   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
792   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
793   tmp  = a->solve_work;
794 
795   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
796   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
797 
798   /* forward solve the lower triangular */
799   tmps = tmp ;
800   aa   = a_a ;
801   aj   = a_j ;
802   ad   = a->diag;
803 
804   for (i = 0,row = 0; i< node_max; ++i){
805     nsz = ns[i];
806     aii = ai[row];
807     v1  = aa + aii;
808     vi  = aj + aii;
809     nz  = ad[row]- aii;
810 
811     switch (nsz){               /* Each loop in 'case' is unrolled */
812     case 1 :
813       sum1 = b[*r++];
814       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
815       for(j=0; j<nz-1; j+=2){
816         i0   = vi[0];
817         i1   = vi[1];
818         vi  +=2;
819         tmp0 = tmps[i0];
820         tmp1 = tmps[i1];
821         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
822       }
823       if(j == nz-1){
824         tmp0 = tmps[*vi++];
825         sum1 -= *v1++ *tmp0;
826       }
827       tmp[row ++]=sum1;
828       break;
829     case 2:
830       sum1 = b[*r++];
831       sum2 = b[*r++];
832       v2   = aa + ai[row+1];
833 
834       for(j=0; j<nz-1; j+=2){
835         i0   = vi[0];
836         i1   = vi[1];
837         vi  +=2;
838         tmp0 = tmps[i0];
839         tmp1 = tmps[i1];
840         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
841         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
842       }
843       if(j == nz-1){
844         tmp0 = tmps[*vi++];
845         sum1 -= *v1++ *tmp0;
846         sum2 -= *v2++ *tmp0;
847       }
848       sum2 -= *v2++ * sum1;
849       tmp[row ++]=sum1;
850       tmp[row ++]=sum2;
851       break;
852     case 3:
853       sum1 = b[*r++];
854       sum2 = b[*r++];
855       sum3 = b[*r++];
856       v2   = aa + ai[row+1];
857       v3   = aa + ai[row+2];
858 
859       for (j=0; j<nz-1; j+=2){
860         i0   = vi[0];
861         i1   = vi[1];
862         vi  +=2;
863         tmp0 = tmps[i0];
864         tmp1 = tmps[i1];
865         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
866         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
867         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
868       }
869       if (j == nz-1){
870         tmp0 = tmps[*vi++];
871         sum1 -= *v1++ *tmp0;
872         sum2 -= *v2++ *tmp0;
873         sum3 -= *v3++ *tmp0;
874       }
875       sum2 -= *v2++ * sum1;
876       sum3 -= *v3++ * sum1;
877       sum3 -= *v3++ * sum2;
878       tmp[row ++]=sum1;
879       tmp[row ++]=sum2;
880       tmp[row ++]=sum3;
881       break;
882 
883     case 4:
884       sum1 = b[*r++];
885       sum2 = b[*r++];
886       sum3 = b[*r++];
887       sum4 = b[*r++];
888       v2   = aa + ai[row+1];
889       v3   = aa + ai[row+2];
890       v4   = aa + ai[row+3];
891 
892       for (j=0; j<nz-1; j+=2){
893         i0   = vi[0];
894         i1   = vi[1];
895         vi  +=2;
896         tmp0 = tmps[i0];
897         tmp1 = tmps[i1];
898         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
899         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
900         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
901         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
902       }
903       if (j == nz-1){
904         tmp0 = tmps[*vi++];
905         sum1 -= *v1++ *tmp0;
906         sum2 -= *v2++ *tmp0;
907         sum3 -= *v3++ *tmp0;
908         sum4 -= *v4++ *tmp0;
909       }
910       sum2 -= *v2++ * sum1;
911       sum3 -= *v3++ * sum1;
912       sum4 -= *v4++ * sum1;
913       sum3 -= *v3++ * sum2;
914       sum4 -= *v4++ * sum2;
915       sum4 -= *v4++ * sum3;
916 
917       tmp[row ++]=sum1;
918       tmp[row ++]=sum2;
919       tmp[row ++]=sum3;
920       tmp[row ++]=sum4;
921       break;
922     case 5:
923       sum1 = b[*r++];
924       sum2 = b[*r++];
925       sum3 = b[*r++];
926       sum4 = b[*r++];
927       sum5 = b[*r++];
928       v2   = aa + ai[row+1];
929       v3   = aa + ai[row+2];
930       v4   = aa + ai[row+3];
931       v5   = aa + ai[row+4];
932 
933       for (j=0; j<nz-1; j+=2){
934         i0   = vi[0];
935         i1   = vi[1];
936         vi  +=2;
937         tmp0 = tmps[i0];
938         tmp1 = tmps[i1];
939         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
940         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
941         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
942         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
943         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
944       }
945       if (j == nz-1){
946         tmp0 = tmps[*vi++];
947         sum1 -= *v1++ *tmp0;
948         sum2 -= *v2++ *tmp0;
949         sum3 -= *v3++ *tmp0;
950         sum4 -= *v4++ *tmp0;
951         sum5 -= *v5++ *tmp0;
952       }
953 
954       sum2 -= *v2++ * sum1;
955       sum3 -= *v3++ * sum1;
956       sum4 -= *v4++ * sum1;
957       sum5 -= *v5++ * sum1;
958       sum3 -= *v3++ * sum2;
959       sum4 -= *v4++ * sum2;
960       sum5 -= *v5++ * sum2;
961       sum4 -= *v4++ * sum3;
962       sum5 -= *v5++ * sum3;
963       sum5 -= *v5++ * sum4;
964 
965       tmp[row ++]=sum1;
966       tmp[row ++]=sum2;
967       tmp[row ++]=sum3;
968       tmp[row ++]=sum4;
969       tmp[row ++]=sum5;
970       break;
971     default:
972       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
973     }
974   }
975   /* backward solve the upper triangular */
976   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
977     nsz = ns[i];
978     aii = ai[row+1] -1;
979     v1  = aa + aii;
980     vi  = aj + aii;
981     nz  = aii- ad[row];
982     switch (nsz){               /* Each loop in 'case' is unrolled */
983     case 1 :
984       sum1 = tmp[row];
985 
986       for(j=nz ; j>1; j-=2){
987         vi  -=2;
988         i0   = vi[2];
989         i1   = vi[1];
990         tmp0 = tmps[i0];
991         tmp1 = tmps[i1];
992         v1   -= 2;
993         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
994       }
995       if (j==1){
996         tmp0  = tmps[*vi--];
997         sum1 -= *v1-- * tmp0;
998       }
999       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1000       break;
1001     case 2 :
1002       sum1 = tmp[row];
1003       sum2 = tmp[row -1];
1004       v2   = aa + ai[row]-1;
1005       for (j=nz ; j>1; j-=2){
1006         vi  -=2;
1007         i0   = vi[2];
1008         i1   = vi[1];
1009         tmp0 = tmps[i0];
1010         tmp1 = tmps[i1];
1011         v1   -= 2;
1012         v2   -= 2;
1013         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1014         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1015       }
1016       if (j==1){
1017         tmp0  = tmps[*vi--];
1018         sum1 -= *v1-- * tmp0;
1019         sum2 -= *v2-- * tmp0;
1020       }
1021 
1022       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1023       sum2   -= *v2-- * tmp0;
1024       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1025       break;
1026     case 3 :
1027       sum1 = tmp[row];
1028       sum2 = tmp[row -1];
1029       sum3 = tmp[row -2];
1030       v2   = aa + ai[row]-1;
1031       v3   = aa + ai[row -1]-1;
1032       for (j=nz ; j>1; j-=2){
1033         vi  -=2;
1034         i0   = vi[2];
1035         i1   = vi[1];
1036         tmp0 = tmps[i0];
1037         tmp1 = tmps[i1];
1038         v1   -= 2;
1039         v2   -= 2;
1040         v3   -= 2;
1041         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1042         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1043         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1044       }
1045       if (j==1){
1046         tmp0  = tmps[*vi--];
1047         sum1 -= *v1-- * tmp0;
1048         sum2 -= *v2-- * tmp0;
1049         sum3 -= *v3-- * tmp0;
1050       }
1051       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1052       sum2   -= *v2-- * tmp0;
1053       sum3   -= *v3-- * tmp0;
1054       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1055       sum3   -= *v3-- * tmp0;
1056       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1057 
1058       break;
1059     case 4 :
1060       sum1 = tmp[row];
1061       sum2 = tmp[row -1];
1062       sum3 = tmp[row -2];
1063       sum4 = tmp[row -3];
1064       v2   = aa + ai[row]-1;
1065       v3   = aa + ai[row -1]-1;
1066       v4   = aa + ai[row -2]-1;
1067 
1068       for (j=nz ; j>1; j-=2){
1069         vi  -=2;
1070         i0   = vi[2];
1071         i1   = vi[1];
1072         tmp0 = tmps[i0];
1073         tmp1 = tmps[i1];
1074         v1  -= 2;
1075         v2  -= 2;
1076         v3  -= 2;
1077         v4  -= 2;
1078         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1079         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1080         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1081         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1082       }
1083       if (j==1){
1084         tmp0  = tmps[*vi--];
1085         sum1 -= *v1-- * tmp0;
1086         sum2 -= *v2-- * tmp0;
1087         sum3 -= *v3-- * tmp0;
1088         sum4 -= *v4-- * tmp0;
1089       }
1090 
1091       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1092       sum2   -= *v2-- * tmp0;
1093       sum3   -= *v3-- * tmp0;
1094       sum4   -= *v4-- * tmp0;
1095       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1096       sum3   -= *v3-- * tmp0;
1097       sum4   -= *v4-- * tmp0;
1098       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1099       sum4   -= *v4-- * tmp0;
1100       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1101       break;
1102     case 5 :
1103       sum1 = tmp[row];
1104       sum2 = tmp[row -1];
1105       sum3 = tmp[row -2];
1106       sum4 = tmp[row -3];
1107       sum5 = tmp[row -4];
1108       v2   = aa + ai[row]-1;
1109       v3   = aa + ai[row -1]-1;
1110       v4   = aa + ai[row -2]-1;
1111       v5   = aa + ai[row -3]-1;
1112       for (j=nz ; j>1; j-=2){
1113         vi  -= 2;
1114         i0   = vi[2];
1115         i1   = vi[1];
1116         tmp0 = tmps[i0];
1117         tmp1 = tmps[i1];
1118         v1   -= 2;
1119         v2   -= 2;
1120         v3   -= 2;
1121         v4   -= 2;
1122         v5   -= 2;
1123         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1124         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1125         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1126         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1127         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1128       }
1129       if (j==1){
1130         tmp0  = tmps[*vi--];
1131         sum1 -= *v1-- * tmp0;
1132         sum2 -= *v2-- * tmp0;
1133         sum3 -= *v3-- * tmp0;
1134         sum4 -= *v4-- * tmp0;
1135         sum5 -= *v5-- * tmp0;
1136       }
1137 
1138       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1139       sum2   -= *v2-- * tmp0;
1140       sum3   -= *v3-- * tmp0;
1141       sum4   -= *v4-- * tmp0;
1142       sum5   -= *v5-- * tmp0;
1143       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1144       sum3   -= *v3-- * tmp0;
1145       sum4   -= *v4-- * tmp0;
1146       sum5   -= *v5-- * tmp0;
1147       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1148       sum4   -= *v4-- * tmp0;
1149       sum5   -= *v5-- * tmp0;
1150       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1151       sum5   -= *v5-- * tmp0;
1152       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1153       break;
1154     default:
1155       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1156     }
1157   }
1158   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1159   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1160   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1161   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1162   ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr);
1163   PetscFunctionReturn(0);
1164 }
1165 
1166 #undef __FUNCT__
1167 #define __FUNCT__ "MatLUFactorNumeric_Inode"
1168 PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1169 {
1170   Mat            C = *B;
1171   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1172   IS             iscol = b->col,isrow = b->row,isicol = b->icol;
1173   PetscErrorCode ierr;
1174   PetscInt       *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1175   PetscInt       *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1176   PetscInt       *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1177   PetscInt       *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1178   PetscScalar    *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1179   PetscScalar    tmp,*ba = b->a,*aa = a->a,*pv;
1180   PetscReal      rs=0.0;
1181   LUShift_Ctx    sctx;
1182   PetscInt       newshift;
1183 
1184   PetscFunctionBegin;
1185   sctx.shift_top  = 0;
1186   sctx.nshift_max = 0;
1187   sctx.shift_lo   = 0;
1188   sctx.shift_hi   = 0;
1189 
1190   /* if both shift schemes are chosen by user, only use info->shiftpd */
1191   if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1192   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1193     sctx.shift_top = 0;
1194     for (i=0; i<n; i++) {
1195       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1196       rs    = 0.0;
1197       ajtmp = aj + ai[i];
1198       rtmp1 = aa + ai[i];
1199       nz = ai[i+1] - ai[i];
1200       for (j=0; j<nz; j++){
1201         if (*ajtmp != i){
1202           rs += PetscAbsScalar(*rtmp1++);
1203         } else {
1204           rs -= PetscRealPart(*rtmp1++);
1205         }
1206         ajtmp++;
1207       }
1208       if (rs>sctx.shift_top) sctx.shift_top = rs;
1209     }
1210     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1211     sctx.shift_top *= 1.1;
1212     sctx.nshift_max = 5;
1213     sctx.shift_lo   = 0.;
1214     sctx.shift_hi   = 1.;
1215   }
1216   sctx.shift_amount = 0;
1217   sctx.nshift       = 0;
1218 
1219   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1220   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1221   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1222   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1223   ierr  = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1224   ics   = ic ;
1225   rtmp2 = rtmp1 + n;
1226   rtmp3 = rtmp2 + n;
1227 
1228   node_max = a->inode.node_count;
1229   ns       = a->inode.size ;
1230   if (!ns){
1231     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1232   }
1233 
1234   /* If max inode size > 3, split it into two inodes.*/
1235   /* also map the inode sizes according to the ordering */
1236   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1237   for (i=0,j=0; i<node_max; ++i,++j){
1238     if (ns[i]>3) {
1239       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1240       ++j;
1241       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1242     } else {
1243       tmp_vec1[j] = ns[i];
1244     }
1245   }
1246   /* Use the correct node_max */
1247   node_max = j;
1248 
1249   /* Now reorder the inode info based on mat re-ordering info */
1250   /* First create a row -> inode_size_array_index map */
1251   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1252   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1253   for (i=0,row=0; i<node_max; i++) {
1254     nodesz = tmp_vec1[i];
1255     for (j=0; j<nodesz; j++,row++) {
1256       nsmap[row] = i;
1257     }
1258   }
1259   /* Using nsmap, create a reordered ns structure */
1260   for (i=0,j=0; i< node_max; i++) {
1261     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1262     tmp_vec2[i]  = nodesz;
1263     j           += nodesz;
1264   }
1265   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1266   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1267   /* Now use the correct ns */
1268   ns = tmp_vec2;
1269 
1270   do {
1271     sctx.lushift = PETSC_FALSE;
1272     /* Now loop over each block-row, and do the factorization */
1273     for (i=0,row=0; i<node_max; i++) {
1274       nodesz = ns[i];
1275       nz     = bi[row+1] - bi[row];
1276       bjtmp  = bj + bi[row];
1277 
1278       switch (nodesz){
1279       case 1:
1280         for  (j=0; j<nz; j++){
1281           idx        = bjtmp[j];
1282           rtmp1[idx] = 0.0;
1283         }
1284 
1285         /* load in initial (unfactored row) */
1286         idx    = r[row];
1287         nz_tmp = ai[idx+1] - ai[idx];
1288         ajtmp  = aj + ai[idx];
1289         v1     = aa + ai[idx];
1290 
1291         for (j=0; j<nz_tmp; j++) {
1292           idx        = ics[ajtmp[j]];
1293           rtmp1[idx] = v1[j];
1294         }
1295         rtmp1[ics[r[row]]] += sctx.shift_amount;
1296 
1297         prow = *bjtmp++ ;
1298         while (prow < row) {
1299           pc1 = rtmp1 + prow;
1300           if (*pc1 != 0.0){
1301             pv   = ba + bd[prow];
1302             pj   = nbj + bd[prow];
1303             mul1 = *pc1 * *pv++;
1304             *pc1 = mul1;
1305             nz_tmp = bi[prow+1] - bd[prow] - 1;
1306             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1307             for (j=0; j<nz_tmp; j++) {
1308               tmp = pv[j];
1309               idx = pj[j];
1310               rtmp1[idx] -= mul1 * tmp;
1311             }
1312           }
1313           prow = *bjtmp++ ;
1314         }
1315         pj  = bj + bi[row];
1316         pc1 = ba + bi[row];
1317 
1318         sctx.pv    = rtmp1[row];
1319         rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */
1320         rs         = 0.0;
1321         for (j=0; j<nz; j++) {
1322           idx    = pj[j];
1323           pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */
1324           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1325         }
1326         sctx.rs  = rs;
1327         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1328         if (newshift == 1) goto endofwhile;
1329         break;
1330 
1331       case 2:
1332         for (j=0; j<nz; j++) {
1333           idx        = bjtmp[j];
1334           rtmp1[idx] = 0.0;
1335           rtmp2[idx] = 0.0;
1336         }
1337 
1338         /* load in initial (unfactored row) */
1339         idx    = r[row];
1340         nz_tmp = ai[idx+1] - ai[idx];
1341         ajtmp  = aj + ai[idx];
1342         v1     = aa + ai[idx];
1343         v2     = aa + ai[idx+1];
1344         for (j=0; j<nz_tmp; j++) {
1345           idx        = ics[ajtmp[j]];
1346           rtmp1[idx] = v1[j];
1347           rtmp2[idx] = v2[j];
1348         }
1349         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1350         rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1351 
1352         prow = *bjtmp++ ;
1353         while (prow < row) {
1354           pc1 = rtmp1 + prow;
1355           pc2 = rtmp2 + prow;
1356           if (*pc1 != 0.0 || *pc2 != 0.0){
1357             pv   = ba + bd[prow];
1358             pj   = nbj + bd[prow];
1359             mul1 = *pc1 * *pv;
1360             mul2 = *pc2 * *pv;
1361             ++pv;
1362             *pc1 = mul1;
1363             *pc2 = mul2;
1364 
1365             nz_tmp = bi[prow+1] - bd[prow] - 1;
1366             for (j=0; j<nz_tmp; j++) {
1367               tmp = pv[j];
1368               idx = pj[j];
1369               rtmp1[idx] -= mul1 * tmp;
1370               rtmp2[idx] -= mul2 * tmp;
1371             }
1372             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1373           }
1374           prow = *bjtmp++ ;
1375         }
1376 
1377         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1378         pc1 = rtmp1 + prow;
1379         pc2 = rtmp2 + prow;
1380 
1381         sctx.pv = *pc1;
1382         pj      = bj + bi[prow];
1383         rs      = 0.0;
1384         for (j=0; j<nz; j++){
1385           idx = pj[j];
1386           if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]);
1387         }
1388         sctx.rs = rs;
1389         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1390         if (newshift == 1) goto endofwhile;
1391 
1392         if (*pc2 != 0.0){
1393           pj     = nbj + bd[prow];
1394           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1395           *pc2   = mul2;
1396           nz_tmp = bi[prow+1] - bd[prow] - 1;
1397           for (j=0; j<nz_tmp; j++) {
1398             idx = pj[j] ;
1399             tmp = rtmp1[idx];
1400             rtmp2[idx] -= mul2 * tmp;
1401           }
1402           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1403         }
1404 
1405         pj  = bj + bi[row];
1406         pc1 = ba + bi[row];
1407         pc2 = ba + bi[row+1];
1408 
1409         sctx.pv = rtmp2[row+1];
1410         rs = 0.0;
1411         rtmp1[row]   = 1.0/rtmp1[row];
1412         rtmp2[row+1] = 1.0/rtmp2[row+1];
1413         /* copy row entries from dense representation to sparse */
1414         for (j=0; j<nz; j++) {
1415           idx    = pj[j];
1416           pc1[j] = rtmp1[idx];
1417           pc2[j] = rtmp2[idx];
1418           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1419         }
1420         sctx.rs = rs;
1421         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1422         if (newshift == 1) goto endofwhile;
1423         break;
1424 
1425       case 3:
1426         for  (j=0; j<nz; j++) {
1427           idx        = bjtmp[j];
1428           rtmp1[idx] = 0.0;
1429           rtmp2[idx] = 0.0;
1430           rtmp3[idx] = 0.0;
1431         }
1432         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1433         idx    = r[row];
1434         nz_tmp = ai[idx+1] - ai[idx];
1435         ajtmp = aj + ai[idx];
1436         v1    = aa + ai[idx];
1437         v2    = aa + ai[idx+1];
1438         v3    = aa + ai[idx+2];
1439         for (j=0; j<nz_tmp; j++) {
1440           idx        = ics[ajtmp[j]];
1441           rtmp1[idx] = v1[j];
1442           rtmp2[idx] = v2[j];
1443           rtmp3[idx] = v3[j];
1444         }
1445         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1446         rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1447         rtmp3[ics[r[row+2]]] += sctx.shift_amount;
1448 
1449         /* loop over all pivot row blocks above this row block */
1450         prow = *bjtmp++ ;
1451         while (prow < row) {
1452           pc1 = rtmp1 + prow;
1453           pc2 = rtmp2 + prow;
1454           pc3 = rtmp3 + prow;
1455           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1456             pv   = ba  + bd[prow];
1457             pj   = nbj + bd[prow];
1458             mul1 = *pc1 * *pv;
1459             mul2 = *pc2 * *pv;
1460             mul3 = *pc3 * *pv;
1461             ++pv;
1462             *pc1 = mul1;
1463             *pc2 = mul2;
1464             *pc3 = mul3;
1465 
1466             nz_tmp = bi[prow+1] - bd[prow] - 1;
1467             /* update this row based on pivot row */
1468             for (j=0; j<nz_tmp; j++) {
1469               tmp = pv[j];
1470               idx = pj[j];
1471               rtmp1[idx] -= mul1 * tmp;
1472               rtmp2[idx] -= mul2 * tmp;
1473               rtmp3[idx] -= mul3 * tmp;
1474             }
1475             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1476           }
1477           prow = *bjtmp++ ;
1478         }
1479 
1480         /* Now take care of diagonal 3x3 block in this set of rows */
1481         /* note: prow = row here */
1482         pc1 = rtmp1 + prow;
1483         pc2 = rtmp2 + prow;
1484         pc3 = rtmp3 + prow;
1485 
1486         sctx.pv = *pc1;
1487         pj      = bj + bi[prow];
1488         rs      = 0.0;
1489         for (j=0; j<nz; j++){
1490           idx = pj[j];
1491           if (idx != row) rs += PetscAbsScalar(rtmp1[idx]);
1492         }
1493         sctx.rs = rs;
1494         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1495         if (newshift == 1) goto endofwhile;
1496 
1497         if (*pc2 != 0.0 || *pc3 != 0.0){
1498           mul2 = (*pc2)/(*pc1);
1499           mul3 = (*pc3)/(*pc1);
1500           *pc2 = mul2;
1501           *pc3 = mul3;
1502           nz_tmp = bi[prow+1] - bd[prow] - 1;
1503           pj     = nbj + bd[prow];
1504           for (j=0; j<nz_tmp; j++) {
1505             idx = pj[j] ;
1506             tmp = rtmp1[idx];
1507             rtmp2[idx] -= mul2 * tmp;
1508             rtmp3[idx] -= mul3 * tmp;
1509           }
1510           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1511         }
1512         ++prow;
1513 
1514         pc2 = rtmp2 + prow;
1515         pc3 = rtmp3 + prow;
1516         sctx.pv = *pc2;
1517         pj      = bj + bi[prow];
1518         rs      = 0.0;
1519         for (j=0; j<nz; j++){
1520           idx = pj[j];
1521           if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]);
1522         }
1523         sctx.rs = rs;
1524         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1525         if (newshift == 1) goto endofwhile;
1526 
1527         if (*pc3 != 0.0){
1528           mul3   = (*pc3)/(*pc2);
1529           *pc3   = mul3;
1530           pj     = nbj + bd[prow];
1531           nz_tmp = bi[prow+1] - bd[prow] - 1;
1532           for (j=0; j<nz_tmp; j++) {
1533             idx = pj[j] ;
1534             tmp = rtmp2[idx];
1535             rtmp3[idx] -= mul3 * tmp;
1536           }
1537           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1538         }
1539 
1540         pj  = bj + bi[row];
1541         pc1 = ba + bi[row];
1542         pc2 = ba + bi[row+1];
1543         pc3 = ba + bi[row+2];
1544 
1545         sctx.pv = rtmp3[row+2];
1546         rs = 0.0;
1547         rtmp1[row]   = 1.0/rtmp1[row];
1548         rtmp2[row+1] = 1.0/rtmp2[row+1];
1549         rtmp3[row+2] = 1.0/rtmp3[row+2];
1550         /* copy row entries from dense representation to sparse */
1551         for (j=0; j<nz; j++) {
1552           idx    = pj[j];
1553           pc1[j] = rtmp1[idx];
1554           pc2[j] = rtmp2[idx];
1555           pc3[j] = rtmp3[idx];
1556           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1557         }
1558 
1559         sctx.rs = rs;
1560         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
1561         if (newshift == 1) goto endofwhile;
1562         break;
1563 
1564       default:
1565         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1566       }
1567       row += nodesz;                 /* Update the row */
1568     }
1569     endofwhile:;
1570   } while (sctx.lushift);
1571   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1572   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1573   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1574   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1575   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1576   C->factor      = FACTOR_LU;
1577   C->assembled   = PETSC_TRUE;
1578   if (sctx.nshift) {
1579     if (info->shiftnz) {
1580       ierr = PetscInfo2(0,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1581     } else if (info->shiftpd) {
1582       ierr = PetscInfo4(0,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1583     }
1584   }
1585   ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr);
1586   PetscFunctionReturn(0);
1587 }
1588 
1589 /*
1590      Makes a longer coloring[] array and calls the usual code with that
1591 */
1592 #undef __FUNCT__
1593 #define __FUNCT__ "MatColoringPatch_Inode"
1594 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1595 {
1596   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1597   PetscErrorCode  ierr;
1598   PetscInt        n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1599   PetscInt        *colorused,i;
1600   ISColoringValue *newcolor;
1601 
1602   PetscFunctionBegin;
1603   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1604   /* loop over inodes, marking a color for each column*/
1605   row = 0;
1606   for (i=0; i<m; i++){
1607     for (j=0; j<ns[i]; j++) {
1608       newcolor[row++] = coloring[i] + j*ncolors;
1609     }
1610   }
1611 
1612   /* eliminate unneeded colors */
1613   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1614   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1615   for (i=0; i<n; i++) {
1616     colorused[newcolor[i]] = 1;
1617   }
1618 
1619   for (i=1; i<5*ncolors; i++) {
1620     colorused[i] += colorused[i-1];
1621   }
1622   ncolors = colorused[5*ncolors-1];
1623   for (i=0; i<n; i++) {
1624     newcolor[i] = colorused[newcolor[i]]-1;
1625   }
1626   ierr = PetscFree(colorused);CHKERRQ(ierr);
1627   ierr = ISColoringCreate(mat->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1628   ierr = PetscFree(coloring);CHKERRQ(ierr);
1629   PetscFunctionReturn(0);
1630 }
1631 
1632 #include "src/inline/ilu.h"
1633 
1634 #undef __FUNCT__
1635 #define __FUNCT__ "MatRelax_Inode"
1636 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1637 {
1638   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
1639   PetscScalar     *x,*xs,*ibdiag,*bdiag,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3,*v1,*v2,*v3,*v4,*v5;
1640   PetscScalar     *v = a->a,*b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1641   PetscReal       zeropivot = 1.0e-15;
1642   PetscErrorCode  ierr;
1643   PetscInt        n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1644   PetscInt        *idx,*diag = a->diag,*ii = a->i,sz,k;
1645 
1646   PetscFunctionBegin;
1647   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0");
1648 
1649   its = its*lits;
1650 
1651   if (!a->inode.ibdiag) {
1652     /* calculate space needed for diagonal blocks */
1653     for (i=0; i<m; i++) {
1654       cnt += sizes[i]*sizes[i];
1655     }
1656     a->inode.bdiagsize = cnt;
1657     ierr   = PetscMalloc2(cnt,PetscScalar,&a->inode.ibdiag,cnt,PetscScalar,&a->inode.bdiag);CHKERRQ(ierr);
1658     ibdiag = a->inode.ibdiag;
1659     bdiag  = a->inode.bdiag;
1660     /* copy over the diagonal blocks and invert them */
1661 
1662     cnt = 0;
1663     for (i=0, row = 0; i<m; i++) {
1664       for (j=0; j<sizes[i]; j++) {
1665         for (k=0; k<sizes[i]; k++) {
1666           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1667         }
1668       }
1669       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(PetscScalar));CHKERRQ(ierr);
1670 
1671       switch(sizes[i]) {
1672         case 1:
1673           /* Create matrix data structure */
1674           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1675           ibdiag[cnt] = 1.0/ibdiag[cnt];
1676           break;
1677         case 2:
1678           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt);CHKERRQ(ierr);
1679           break;
1680         case 3:
1681           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt);CHKERRQ(ierr);
1682           break;
1683         case 4:
1684           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt);CHKERRQ(ierr);
1685           break;
1686         case 5:
1687           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt);CHKERRQ(ierr);
1688           break;
1689        default:
1690 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1691       }
1692       cnt += sizes[i]*sizes[i];
1693       row += sizes[i];
1694     }
1695   }
1696   ibdiag = a->inode.ibdiag;
1697   bdiag  = a->inode.bdiag;
1698 
1699   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1700   if (xx != bb) {
1701     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1702   } else {
1703     b = x;
1704   }
1705 
1706   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1707   xs   = x;
1708   if (flag & SOR_ZERO_INITIAL_GUESS) {
1709     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1710 
1711       for (i=0, row=0; i<m; i++) {
1712         sz  = diag[row] - ii[row];
1713         v1  = a->a + ii[row];
1714         idx = a->j + ii[row];
1715 
1716         /* see comments for MatMult_Inode() for how this is coded */
1717         switch (sizes[i]){
1718           case 1:
1719 
1720             sum1  = b[row];
1721             for(n = 0; n<sz-1; n+=2) {
1722               i1   = idx[0];
1723               i2   = idx[1];
1724               idx += 2;
1725               tmp0 = x[i1];
1726               tmp1 = x[i2];
1727               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1728             }
1729 
1730             if (n == sz-1){
1731               tmp0  = x[*idx];
1732               sum1 -= *v1 * tmp0;
1733             }
1734             x[row++] = sum1*(*ibdiag++);
1735             break;
1736           case 2:
1737             v2    = a->a + ii[row+1];
1738             sum1  = b[row];
1739             sum2  = b[row+1];
1740             for(n = 0; n<sz-1; n+=2) {
1741               i1   = idx[0];
1742               i2   = idx[1];
1743               idx += 2;
1744               tmp0 = x[i1];
1745               tmp1 = x[i2];
1746               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1747               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1748             }
1749 
1750             if (n == sz-1){
1751               tmp0  = x[*idx];
1752               sum1 -= v1[0] * tmp0;
1753               sum2 -= v2[0] * tmp0;
1754             }
1755             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1756             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1757             ibdiag  += 4;
1758             break;
1759           case 3:
1760             v2    = a->a + ii[row+1];
1761             v3    = a->a + ii[row+2];
1762             sum1  = b[row];
1763             sum2  = b[row+1];
1764             sum3  = b[row+2];
1765             for(n = 0; n<sz-1; n+=2) {
1766               i1   = idx[0];
1767               i2   = idx[1];
1768               idx += 2;
1769               tmp0 = x[i1];
1770               tmp1 = x[i2];
1771               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1772               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1773               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1774             }
1775 
1776             if (n == sz-1){
1777               tmp0  = x[*idx];
1778               sum1 -= v1[0] * tmp0;
1779               sum2 -= v2[0] * tmp0;
1780               sum3 -= v3[0] * tmp0;
1781             }
1782             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1783             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1784             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1785             ibdiag  += 9;
1786             break;
1787           case 4:
1788             v2    = a->a + ii[row+1];
1789             v3    = a->a + ii[row+2];
1790             v4    = a->a + ii[row+3];
1791             sum1  = b[row];
1792             sum2  = b[row+1];
1793             sum3  = b[row+2];
1794             sum4  = b[row+3];
1795             for(n = 0; n<sz-1; n+=2) {
1796               i1   = idx[0];
1797               i2   = idx[1];
1798               idx += 2;
1799               tmp0 = x[i1];
1800               tmp1 = x[i2];
1801               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1802               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1803               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1804               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1805             }
1806 
1807             if (n == sz-1){
1808               tmp0  = x[*idx];
1809               sum1 -= v1[0] * tmp0;
1810               sum2 -= v2[0] * tmp0;
1811               sum3 -= v3[0] * tmp0;
1812               sum4 -= v4[0] * tmp0;
1813             }
1814             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1815             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1816             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1817             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1818             ibdiag  += 16;
1819             break;
1820           case 5:
1821             v2    = a->a + ii[row+1];
1822             v3    = a->a + ii[row+2];
1823             v4    = a->a + ii[row+3];
1824             v5    = a->a + ii[row+4];
1825             sum1  = b[row];
1826             sum2  = b[row+1];
1827             sum3  = b[row+2];
1828             sum4  = b[row+3];
1829             sum5  = b[row+4];
1830             for(n = 0; n<sz-1; n+=2) {
1831               i1   = idx[0];
1832               i2   = idx[1];
1833               idx += 2;
1834               tmp0 = x[i1];
1835               tmp1 = x[i2];
1836               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1837               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1838               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1839               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1840               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1841             }
1842 
1843             if (n == sz-1){
1844               tmp0  = x[*idx];
1845               sum1 -= v1[0] * tmp0;
1846               sum2 -= v2[0] * tmp0;
1847               sum3 -= v3[0] * tmp0;
1848               sum4 -= v4[0] * tmp0;
1849               sum5 -= v5[0] * tmp0;
1850             }
1851             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1852             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1853             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1854             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1855             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1856             ibdiag  += 25;
1857             break;
1858 	  default:
1859    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1860         }
1861       }
1862 
1863       xb = x;
1864       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1865     } else xb = b;
1866     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1867         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1868       cnt = 0;
1869       for (i=0, row=0; i<m; i++) {
1870 
1871         switch (sizes[i]){
1872           case 1:
1873             x[row++] *= bdiag[cnt++];
1874             break;
1875           case 2:
1876             x1   = x[row]; x2 = x[row+1];
1877             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1878             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1879             x[row++] = tmp1;
1880             x[row++] = tmp2;
1881             cnt += 4;
1882             break;
1883           case 3:
1884             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1885             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1886             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1887             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1888             x[row++] = tmp1;
1889             x[row++] = tmp2;
1890             x[row++] = tmp3;
1891             cnt += 9;
1892             break;
1893           case 4:
1894             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1895             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1896             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1897             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1898             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1899             x[row++] = tmp1;
1900             x[row++] = tmp2;
1901             x[row++] = tmp3;
1902             x[row++] = tmp4;
1903             cnt += 16;
1904             break;
1905           case 5:
1906             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1907             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1908             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1909             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1910             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1911             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1912             x[row++] = tmp1;
1913             x[row++] = tmp2;
1914             x[row++] = tmp3;
1915             x[row++] = tmp4;
1916             x[row++] = tmp5;
1917             cnt += 25;
1918             break;
1919 	  default:
1920    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1921         }
1922       }
1923       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1924     }
1925     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1926 
1927       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1928       for (i=m-1, row=A->rmap.n-1; i>=0; i--) {
1929         ibdiag -= sizes[i]*sizes[i];
1930         sz      = ii[row+1] - diag[row] - 1;
1931         v1      = a->a + diag[row] + 1;
1932         idx     = a->j + diag[row] + 1;
1933 
1934         /* see comments for MatMult_Inode() for how this is coded */
1935         switch (sizes[i]){
1936           case 1:
1937 
1938             sum1  = xb[row];
1939             for(n = 0; n<sz-1; n+=2) {
1940               i1   = idx[0];
1941               i2   = idx[1];
1942               idx += 2;
1943               tmp0 = x[i1];
1944               tmp1 = x[i2];
1945               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1946             }
1947 
1948             if (n == sz-1){
1949               tmp0  = x[*idx];
1950               sum1 -= *v1*tmp0;
1951             }
1952             x[row--] = sum1*(*ibdiag);
1953             break;
1954 
1955           case 2:
1956 
1957             sum1  = xb[row];
1958             sum2  = xb[row-1];
1959             /* note that sum1 is associated with the second of the two rows */
1960             v2    = a->a + diag[row-1] + 2;
1961             for(n = 0; n<sz-1; n+=2) {
1962               i1   = idx[0];
1963               i2   = idx[1];
1964               idx += 2;
1965               tmp0 = x[i1];
1966               tmp1 = x[i2];
1967               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1968               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1969             }
1970 
1971             if (n == sz-1){
1972               tmp0  = x[*idx];
1973               sum1 -= *v1*tmp0;
1974               sum2 -= *v2*tmp0;
1975             }
1976             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1977             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1978             break;
1979           case 3:
1980 
1981             sum1  = xb[row];
1982             sum2  = xb[row-1];
1983             sum3  = xb[row-2];
1984             v2    = a->a + diag[row-1] + 2;
1985             v3    = a->a + diag[row-2] + 3;
1986             for(n = 0; n<sz-1; n+=2) {
1987               i1   = idx[0];
1988               i2   = idx[1];
1989               idx += 2;
1990               tmp0 = x[i1];
1991               tmp1 = x[i2];
1992               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1993               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1994               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1995             }
1996 
1997             if (n == sz-1){
1998               tmp0  = x[*idx];
1999               sum1 -= *v1*tmp0;
2000               sum2 -= *v2*tmp0;
2001               sum3 -= *v3*tmp0;
2002             }
2003             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2004             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2005             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2006             break;
2007           case 4:
2008 
2009             sum1  = xb[row];
2010             sum2  = xb[row-1];
2011             sum3  = xb[row-2];
2012             sum4  = xb[row-3];
2013             v2    = a->a + diag[row-1] + 2;
2014             v3    = a->a + diag[row-2] + 3;
2015             v4    = a->a + diag[row-3] + 4;
2016             for(n = 0; n<sz-1; n+=2) {
2017               i1   = idx[0];
2018               i2   = idx[1];
2019               idx += 2;
2020               tmp0 = x[i1];
2021               tmp1 = x[i2];
2022               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2023               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2024               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2025               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2026             }
2027 
2028             if (n == sz-1){
2029               tmp0  = x[*idx];
2030               sum1 -= *v1*tmp0;
2031               sum2 -= *v2*tmp0;
2032               sum3 -= *v3*tmp0;
2033               sum4 -= *v4*tmp0;
2034             }
2035             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2036             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2037             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2038             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2039             break;
2040           case 5:
2041 
2042             sum1  = xb[row];
2043             sum2  = xb[row-1];
2044             sum3  = xb[row-2];
2045             sum4  = xb[row-3];
2046             sum5  = xb[row-4];
2047             v2    = a->a + diag[row-1] + 2;
2048             v3    = a->a + diag[row-2] + 3;
2049             v4    = a->a + diag[row-3] + 4;
2050             v5    = a->a + diag[row-4] + 5;
2051             for(n = 0; n<sz-1; n+=2) {
2052               i1   = idx[0];
2053               i2   = idx[1];
2054               idx += 2;
2055               tmp0 = x[i1];
2056               tmp1 = x[i2];
2057               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2058               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2059               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2060               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2061               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2062             }
2063 
2064             if (n == sz-1){
2065               tmp0  = x[*idx];
2066               sum1 -= *v1*tmp0;
2067               sum2 -= *v2*tmp0;
2068               sum3 -= *v3*tmp0;
2069               sum4 -= *v4*tmp0;
2070               sum5 -= *v5*tmp0;
2071             }
2072             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2073             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2074             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2075             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2076             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2077             break;
2078 	  default:
2079    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2080         }
2081       }
2082 
2083       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2084     }
2085     its--;
2086   }
2087   if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode");
2088   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2089   if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2090   PetscFunctionReturn(0);
2091 }
2092 
2093 
2094 /*
2095     samestructure indicates that the matrix has not changed its nonzero structure so we
2096     do not need to recompute the inodes
2097 */
2098 #undef __FUNCT__
2099 #define __FUNCT__ "Mat_CheckInode"
2100 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2101 {
2102   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2103   PetscErrorCode ierr;
2104   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2105   PetscTruth     flag;
2106 
2107   PetscFunctionBegin;
2108   if (!a->inode.use)                     PetscFunctionReturn(0);
2109   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2110 
2111 
2112   m = A->rmap.n;
2113   if (a->inode.size) {ns = a->inode.size;}
2114   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2115 
2116   i          = 0;
2117   node_count = 0;
2118   idx        = a->j;
2119   ii         = a->i;
2120   while (i < m){                /* For each row */
2121     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2122     /* Limits the number of elements in a node to 'a->inode.limit' */
2123     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2124       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2125       if (nzy != nzx) break;
2126       idy  += nzx;             /* Same nonzero pattern */
2127       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2128       if (!flag) break;
2129     }
2130     ns[node_count++] = blk_size;
2131     idx += blk_size*nzx;
2132     i    = j;
2133   }
2134   /* If not enough inodes found,, do not use inode version of the routines */
2135   if (!a->inode.size && m && node_count > .9*m) {
2136     ierr = PetscFree(ns);CHKERRQ(ierr);
2137     a->inode.node_count     = 0;
2138     a->inode.size           = PETSC_NULL;
2139     a->inode.use            = PETSC_FALSE;
2140     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2141   } else {
2142     A->ops->mult            = MatMult_Inode;
2143     A->ops->relax           = MatRelax_Inode;
2144     A->ops->multadd         = MatMultAdd_Inode;
2145     A->ops->solve           = MatSolve_Inode;
2146     A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
2147     A->ops->getrowij        = MatGetRowIJ_Inode;
2148     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
2149     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
2150     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
2151     A->ops->coloringpatch   = MatColoringPatch_Inode;
2152     a->inode.node_count     = node_count;
2153     a->inode.size           = ns;
2154     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2155   }
2156   PetscFunctionReturn(0);
2157 }
2158 
2159 /*
2160      This is really ugly. if inodes are used this replaces the
2161   permutations with ones that correspond to rows/cols of the matrix
2162   rather then inode blocks
2163 */
2164 #undef __FUNCT__
2165 #define __FUNCT__ "MatInodeAdjustForInodes"
2166 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2167 {
2168   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2169 
2170   PetscFunctionBegin;
2171   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2172   if (f) {
2173     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2174   }
2175   PetscFunctionReturn(0);
2176 }
2177 
2178 EXTERN_C_BEGIN
2179 #undef __FUNCT__
2180 #define __FUNCT__ "MatAdjustForInodes_Inode"
2181 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2182 {
2183   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2184   PetscErrorCode ierr;
2185   PetscInt       m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
2186   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2187   PetscInt       nslim_col,*ns_col;
2188   IS             ris = *rperm,cis = *cperm;
2189 
2190   PetscFunctionBegin;
2191   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2192   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2193 
2194   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2195   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2196   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
2197   permc = permr + m;
2198 
2199   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2200   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2201 
2202   /* Form the inode structure for the rows of permuted matric using inv perm*/
2203   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2204 
2205   /* Construct the permutations for rows*/
2206   for (i=0,row = 0; i<nslim_row; ++i){
2207     indx      = ridx[i];
2208     start_val = tns[indx];
2209     end_val   = tns[indx + 1];
2210     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2211   }
2212 
2213   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2214   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2215 
2216  /* Construct permutations for columns */
2217   for (i=0,col=0; i<nslim_col; ++i){
2218     indx      = cidx[i];
2219     start_val = tns[indx];
2220     end_val   = tns[indx + 1];
2221     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2222   }
2223 
2224   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2225   ISSetPermutation(*rperm);
2226   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2227   ISSetPermutation(*cperm);
2228 
2229   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2230   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2231 
2232   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2233   ierr = PetscFree(permr);CHKERRQ(ierr);
2234   ierr = ISDestroy(cis);CHKERRQ(ierr);
2235   ierr = ISDestroy(ris);CHKERRQ(ierr);
2236   ierr = PetscFree(tns);CHKERRQ(ierr);
2237   PetscFunctionReturn(0);
2238 }
2239 EXTERN_C_END
2240 
2241 #undef __FUNCT__
2242 #define __FUNCT__ "MatInodeGetInodeSizes"
2243 /*@C
2244    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2245 
2246    Collective on Mat
2247 
2248    Input Parameter:
2249 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2250 
2251    Output Parameter:
2252 +  node_count - no of inodes present in the matrix.
2253 .  sizes      - an array of size node_count,with sizes of each inode.
2254 -  limit      - the max size used to generate the inodes.
2255 
2256    Level: advanced
2257 
2258    Notes: This routine returns some internal storage information
2259    of the matrix, it is intended to be used by advanced users.
2260    It should be called after the matrix is assembled.
2261    The contents of the sizes[] array should not be changed.
2262    PETSC_NULL may be passed for information not requested.
2263 
2264 .keywords: matrix, seqaij, get, inode
2265 
2266 .seealso: MatGetInfo()
2267 @*/
2268 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2269 {
2270   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2271 
2272   PetscFunctionBegin;
2273   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2274   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2275   if (f) {
2276     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2277   }
2278   PetscFunctionReturn(0);
2279 }
2280 
2281 EXTERN_C_BEGIN
2282 #undef __FUNCT__
2283 #define __FUNCT__ "MatInodeGetInodeSizes_Inode"
2284 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2285 {
2286   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2287 
2288   PetscFunctionBegin;
2289   if (node_count) *node_count = a->inode.node_count;
2290   if (sizes)      *sizes      = a->inode.size;
2291   if (limit)      *limit      = a->inode.limit;
2292   PetscFunctionReturn(0);
2293 }
2294 EXTERN_C_END
2295