xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 4c1414c8d12f32185249d7a747f9a5bd5817bea6)
1*4c1414c8SBarry Smith #define PETSCMAT_DLL
2*4c1414c8SBarry Smith 
3*4c1414c8SBarry Smith /*
4*4c1414c8SBarry Smith   This file provides high performance routines for the Inode format (compressed sparse row)
5*4c1414c8SBarry Smith   by taking advantage of rows with identical nonzero structure (I-nodes).
6*4c1414c8SBarry Smith */
7*4c1414c8SBarry Smith #include "src/mat/impls/aij/seq/aij.h"
8*4c1414c8SBarry Smith 
9*4c1414c8SBarry Smith #undef __FUNCT__
10*4c1414c8SBarry Smith #define __FUNCT__ "Mat_CreateColInode"
11*4c1414c8SBarry Smith static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12*4c1414c8SBarry Smith {
13*4c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14*4c1414c8SBarry Smith   PetscErrorCode ierr;
15*4c1414c8SBarry Smith   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16*4c1414c8SBarry Smith 
17*4c1414c8SBarry Smith   PetscFunctionBegin;
18*4c1414c8SBarry Smith   n      = A->cmap.n;
19*4c1414c8SBarry Smith   m      = A->rmap.n;
20*4c1414c8SBarry Smith   ns_row = a->inode.size;
21*4c1414c8SBarry Smith 
22*4c1414c8SBarry Smith   min_mn = (m < n) ? m : n;
23*4c1414c8SBarry Smith   if (!ns) {
24*4c1414c8SBarry Smith     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25*4c1414c8SBarry Smith     for(; count+1 < n; count++,i++);
26*4c1414c8SBarry Smith     if (count < n)  {
27*4c1414c8SBarry Smith       i++;
28*4c1414c8SBarry Smith     }
29*4c1414c8SBarry Smith     *size = i;
30*4c1414c8SBarry Smith     PetscFunctionReturn(0);
31*4c1414c8SBarry Smith   }
32*4c1414c8SBarry Smith   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33*4c1414c8SBarry Smith 
34*4c1414c8SBarry Smith   /* Use the same row structure wherever feasible. */
35*4c1414c8SBarry Smith   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36*4c1414c8SBarry Smith     ns_col[i] = ns_row[i];
37*4c1414c8SBarry Smith   }
38*4c1414c8SBarry Smith 
39*4c1414c8SBarry Smith   /* if m < n; pad up the remainder with inode_limit */
40*4c1414c8SBarry Smith   for(; count+1 < n; count++,i++) {
41*4c1414c8SBarry Smith     ns_col[i] = 1;
42*4c1414c8SBarry Smith   }
43*4c1414c8SBarry Smith   /* The last node is the odd ball. padd it up with the remaining rows; */
44*4c1414c8SBarry Smith   if (count < n)  {
45*4c1414c8SBarry Smith     ns_col[i] = n - count;
46*4c1414c8SBarry Smith     i++;
47*4c1414c8SBarry Smith   } else if (count > n) {
48*4c1414c8SBarry Smith     /* Adjust for the over estimation */
49*4c1414c8SBarry Smith     ns_col[i-1] += n - count;
50*4c1414c8SBarry Smith   }
51*4c1414c8SBarry Smith   *size = i;
52*4c1414c8SBarry Smith   *ns   = ns_col;
53*4c1414c8SBarry Smith   PetscFunctionReturn(0);
54*4c1414c8SBarry Smith }
55*4c1414c8SBarry Smith 
56*4c1414c8SBarry Smith 
57*4c1414c8SBarry Smith /*
58*4c1414c8SBarry Smith       This builds symmetric version of nonzero structure,
59*4c1414c8SBarry Smith */
60*4c1414c8SBarry Smith #undef __FUNCT__
61*4c1414c8SBarry Smith #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric"
62*4c1414c8SBarry Smith static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63*4c1414c8SBarry Smith {
64*4c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65*4c1414c8SBarry Smith   PetscErrorCode ierr;
66*4c1414c8SBarry Smith   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67*4c1414c8SBarry Smith   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68*4c1414c8SBarry Smith 
69*4c1414c8SBarry Smith   PetscFunctionBegin;
70*4c1414c8SBarry Smith   nslim_row = a->inode.node_count;
71*4c1414c8SBarry Smith   m         = A->rmap.n;
72*4c1414c8SBarry Smith   n         = A->cmap.n;
73*4c1414c8SBarry Smith   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74*4c1414c8SBarry Smith 
75*4c1414c8SBarry Smith   /* Use the row_inode as column_inode */
76*4c1414c8SBarry Smith   nslim_col = nslim_row;
77*4c1414c8SBarry Smith   ns_col    = ns_row;
78*4c1414c8SBarry Smith 
79*4c1414c8SBarry Smith   /* allocate space for reformated inode structure */
80*4c1414c8SBarry Smith   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81*4c1414c8SBarry Smith   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82*4c1414c8SBarry Smith   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83*4c1414c8SBarry Smith 
84*4c1414c8SBarry Smith   for (i1=0,col=0; i1<nslim_col; ++i1){
85*4c1414c8SBarry Smith     nsz = ns_col[i1];
86*4c1414c8SBarry Smith     for (i2=0; i2<nsz; ++i2,++col)
87*4c1414c8SBarry Smith       tvc[col] = i1;
88*4c1414c8SBarry Smith   }
89*4c1414c8SBarry Smith   /* allocate space for row pointers */
90*4c1414c8SBarry Smith   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91*4c1414c8SBarry Smith   *iia = ia;
92*4c1414c8SBarry Smith   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93*4c1414c8SBarry Smith   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94*4c1414c8SBarry Smith 
95*4c1414c8SBarry Smith   /* determine the number of columns in each row */
96*4c1414c8SBarry Smith   ia[0] = oshift;
97*4c1414c8SBarry Smith   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98*4c1414c8SBarry Smith 
99*4c1414c8SBarry Smith     j    = aj + ai[row] + ishift;
100*4c1414c8SBarry Smith     jmax = aj + ai[row+1] + ishift;
101*4c1414c8SBarry Smith     i2   = 0;
102*4c1414c8SBarry Smith     col  = *j++ + ishift;
103*4c1414c8SBarry Smith     i2   = tvc[col];
104*4c1414c8SBarry Smith     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105*4c1414c8SBarry Smith       ia[i1+1]++;
106*4c1414c8SBarry Smith       ia[i2+1]++;
107*4c1414c8SBarry Smith       i2++;                     /* Start col of next node */
108*4c1414c8SBarry Smith       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109*4c1414c8SBarry Smith       i2 = tvc[col];
110*4c1414c8SBarry Smith     }
111*4c1414c8SBarry Smith     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112*4c1414c8SBarry Smith   }
113*4c1414c8SBarry Smith 
114*4c1414c8SBarry Smith   /* shift ia[i] to point to next row */
115*4c1414c8SBarry Smith   for (i1=1; i1<nslim_row+1; i1++) {
116*4c1414c8SBarry Smith     row        = ia[i1-1];
117*4c1414c8SBarry Smith     ia[i1]    += row;
118*4c1414c8SBarry Smith     work[i1-1] = row - oshift;
119*4c1414c8SBarry Smith   }
120*4c1414c8SBarry Smith 
121*4c1414c8SBarry Smith   /* allocate space for column pointers */
122*4c1414c8SBarry Smith   nz   = ia[nslim_row] + (!ishift);
123*4c1414c8SBarry Smith   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124*4c1414c8SBarry Smith   *jja = ja;
125*4c1414c8SBarry Smith 
126*4c1414c8SBarry Smith  /* loop over lower triangular part putting into ja */
127*4c1414c8SBarry Smith   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128*4c1414c8SBarry Smith     j    = aj + ai[row] + ishift;
129*4c1414c8SBarry Smith     jmax = aj + ai[row+1] + ishift;
130*4c1414c8SBarry Smith     i2   = 0;                     /* Col inode index */
131*4c1414c8SBarry Smith     col  = *j++ + ishift;
132*4c1414c8SBarry Smith     i2   = tvc[col];
133*4c1414c8SBarry Smith     while (i2<i1 && j<jmax) {
134*4c1414c8SBarry Smith       ja[work[i2]++] = i1 + oshift;
135*4c1414c8SBarry Smith       ja[work[i1]++] = i2 + oshift;
136*4c1414c8SBarry Smith       ++i2;
137*4c1414c8SBarry Smith       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138*4c1414c8SBarry Smith       i2 = tvc[col];
139*4c1414c8SBarry Smith     }
140*4c1414c8SBarry Smith     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141*4c1414c8SBarry Smith 
142*4c1414c8SBarry Smith   }
143*4c1414c8SBarry Smith   ierr = PetscFree(work);CHKERRQ(ierr);
144*4c1414c8SBarry Smith   ierr = PetscFree(tns);CHKERRQ(ierr);
145*4c1414c8SBarry Smith   ierr = PetscFree(tvc);CHKERRQ(ierr);
146*4c1414c8SBarry Smith   PetscFunctionReturn(0);
147*4c1414c8SBarry Smith }
148*4c1414c8SBarry Smith 
149*4c1414c8SBarry Smith /*
150*4c1414c8SBarry Smith       This builds nonsymmetric version of nonzero structure,
151*4c1414c8SBarry Smith */
152*4c1414c8SBarry Smith #undef __FUNCT__
153*4c1414c8SBarry Smith #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric"
154*4c1414c8SBarry Smith static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155*4c1414c8SBarry Smith {
156*4c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157*4c1414c8SBarry Smith   PetscErrorCode ierr;
158*4c1414c8SBarry Smith   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159*4c1414c8SBarry Smith   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160*4c1414c8SBarry Smith 
161*4c1414c8SBarry Smith   PetscFunctionBegin;
162*4c1414c8SBarry Smith   nslim_row = a->inode.node_count;
163*4c1414c8SBarry Smith   n         = A->cmap.n;
164*4c1414c8SBarry Smith 
165*4c1414c8SBarry Smith   /* Create The column_inode for this matrix */
166*4c1414c8SBarry Smith   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167*4c1414c8SBarry Smith 
168*4c1414c8SBarry Smith   /* allocate space for reformated column_inode structure */
169*4c1414c8SBarry Smith   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170*4c1414c8SBarry Smith   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171*4c1414c8SBarry Smith   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172*4c1414c8SBarry Smith 
173*4c1414c8SBarry Smith   for (i1=0,col=0; i1<nslim_col; ++i1){
174*4c1414c8SBarry Smith     nsz = ns_col[i1];
175*4c1414c8SBarry Smith     for (i2=0; i2<nsz; ++i2,++col)
176*4c1414c8SBarry Smith       tvc[col] = i1;
177*4c1414c8SBarry Smith   }
178*4c1414c8SBarry Smith   /* allocate space for row pointers */
179*4c1414c8SBarry Smith   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180*4c1414c8SBarry Smith   *iia = ia;
181*4c1414c8SBarry Smith   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182*4c1414c8SBarry Smith   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183*4c1414c8SBarry Smith 
184*4c1414c8SBarry Smith   /* determine the number of columns in each row */
185*4c1414c8SBarry Smith   ia[0] = oshift;
186*4c1414c8SBarry Smith   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187*4c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
188*4c1414c8SBarry Smith     col = *j++ + ishift;
189*4c1414c8SBarry Smith     i2  = tvc[col];
190*4c1414c8SBarry Smith     nz  = ai[row+1] - ai[row];
191*4c1414c8SBarry Smith     while (nz-- > 0) {           /* off-diagonal elemets */
192*4c1414c8SBarry Smith       ia[i1+1]++;
193*4c1414c8SBarry Smith       i2++;                     /* Start col of next node */
194*4c1414c8SBarry Smith       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195*4c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
196*4c1414c8SBarry Smith     }
197*4c1414c8SBarry Smith   }
198*4c1414c8SBarry Smith 
199*4c1414c8SBarry Smith   /* shift ia[i] to point to next row */
200*4c1414c8SBarry Smith   for (i1=1; i1<nslim_row+1; i1++) {
201*4c1414c8SBarry Smith     row        = ia[i1-1];
202*4c1414c8SBarry Smith     ia[i1]    += row;
203*4c1414c8SBarry Smith     work[i1-1] = row - oshift;
204*4c1414c8SBarry Smith   }
205*4c1414c8SBarry Smith 
206*4c1414c8SBarry Smith   /* allocate space for column pointers */
207*4c1414c8SBarry Smith   nz   = ia[nslim_row] + (!ishift);
208*4c1414c8SBarry Smith   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209*4c1414c8SBarry Smith   *jja = ja;
210*4c1414c8SBarry Smith 
211*4c1414c8SBarry Smith  /* loop over matrix putting into ja */
212*4c1414c8SBarry Smith   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213*4c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
214*4c1414c8SBarry Smith     i2  = 0;                     /* Col inode index */
215*4c1414c8SBarry Smith     col = *j++ + ishift;
216*4c1414c8SBarry Smith     i2  = tvc[col];
217*4c1414c8SBarry Smith     nz  = ai[row+1] - ai[row];
218*4c1414c8SBarry Smith     while (nz-- > 0) {
219*4c1414c8SBarry Smith       ja[work[i1]++] = i2 + oshift;
220*4c1414c8SBarry Smith       ++i2;
221*4c1414c8SBarry Smith       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222*4c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
223*4c1414c8SBarry Smith     }
224*4c1414c8SBarry Smith   }
225*4c1414c8SBarry Smith   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226*4c1414c8SBarry Smith   ierr = PetscFree(work);CHKERRQ(ierr);
227*4c1414c8SBarry Smith   ierr = PetscFree(tns);CHKERRQ(ierr);
228*4c1414c8SBarry Smith   ierr = PetscFree(tvc);CHKERRQ(ierr);
229*4c1414c8SBarry Smith   PetscFunctionReturn(0);
230*4c1414c8SBarry Smith }
231*4c1414c8SBarry Smith 
232*4c1414c8SBarry Smith #undef __FUNCT__
233*4c1414c8SBarry Smith #define __FUNCT__ "MatGetRowIJ_Inode"
234*4c1414c8SBarry Smith static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235*4c1414c8SBarry Smith {
236*4c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237*4c1414c8SBarry Smith   PetscErrorCode ierr;
238*4c1414c8SBarry Smith 
239*4c1414c8SBarry Smith   PetscFunctionBegin;
240*4c1414c8SBarry Smith   *n     = a->inode.node_count;
241*4c1414c8SBarry Smith   if (!ia) PetscFunctionReturn(0);
242*4c1414c8SBarry Smith 
243*4c1414c8SBarry Smith   if (symmetric) {
244*4c1414c8SBarry Smith     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
245*4c1414c8SBarry Smith   } else {
246*4c1414c8SBarry Smith     ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
247*4c1414c8SBarry Smith   }
248*4c1414c8SBarry Smith   PetscFunctionReturn(0);
249*4c1414c8SBarry Smith }
250*4c1414c8SBarry Smith 
251*4c1414c8SBarry Smith #undef __FUNCT__
252*4c1414c8SBarry Smith #define __FUNCT__ "MatRestoreRowIJ_Inode"
253*4c1414c8SBarry Smith static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
254*4c1414c8SBarry Smith {
255*4c1414c8SBarry Smith   PetscErrorCode ierr;
256*4c1414c8SBarry Smith 
257*4c1414c8SBarry Smith   PetscFunctionBegin;
258*4c1414c8SBarry Smith   if (!ia) PetscFunctionReturn(0);
259*4c1414c8SBarry Smith   ierr = PetscFree(*ia);CHKERRQ(ierr);
260*4c1414c8SBarry Smith   ierr = PetscFree(*ja);CHKERRQ(ierr);
261*4c1414c8SBarry Smith   PetscFunctionReturn(0);
262*4c1414c8SBarry Smith }
263*4c1414c8SBarry Smith 
264*4c1414c8SBarry Smith /* ----------------------------------------------------------- */
265*4c1414c8SBarry Smith 
266*4c1414c8SBarry Smith #undef __FUNCT__
267*4c1414c8SBarry Smith #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric"
268*4c1414c8SBarry Smith static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
269*4c1414c8SBarry Smith {
270*4c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
271*4c1414c8SBarry Smith   PetscErrorCode ierr;
272*4c1414c8SBarry Smith   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
273*4c1414c8SBarry Smith   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
274*4c1414c8SBarry Smith 
275*4c1414c8SBarry Smith   PetscFunctionBegin;
276*4c1414c8SBarry Smith   nslim_row = a->inode.node_count;
277*4c1414c8SBarry Smith   n         = A->cmap.n;
278*4c1414c8SBarry Smith 
279*4c1414c8SBarry Smith   /* Create The column_inode for this matrix */
280*4c1414c8SBarry Smith   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
281*4c1414c8SBarry Smith 
282*4c1414c8SBarry Smith   /* allocate space for reformated column_inode structure */
283*4c1414c8SBarry Smith   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
284*4c1414c8SBarry Smith   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
285*4c1414c8SBarry Smith   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
286*4c1414c8SBarry Smith 
287*4c1414c8SBarry Smith   for (i1=0,col=0; i1<nslim_col; ++i1){
288*4c1414c8SBarry Smith     nsz = ns_col[i1];
289*4c1414c8SBarry Smith     for (i2=0; i2<nsz; ++i2,++col)
290*4c1414c8SBarry Smith       tvc[col] = i1;
291*4c1414c8SBarry Smith   }
292*4c1414c8SBarry Smith   /* allocate space for column pointers */
293*4c1414c8SBarry Smith   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
294*4c1414c8SBarry Smith   *iia = ia;
295*4c1414c8SBarry Smith   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
296*4c1414c8SBarry Smith   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
297*4c1414c8SBarry Smith 
298*4c1414c8SBarry Smith   /* determine the number of columns in each row */
299*4c1414c8SBarry Smith   ia[0] = oshift;
300*4c1414c8SBarry Smith   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
301*4c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
302*4c1414c8SBarry Smith     col = *j++ + ishift;
303*4c1414c8SBarry Smith     i2  = tvc[col];
304*4c1414c8SBarry Smith     nz  = ai[row+1] - ai[row];
305*4c1414c8SBarry Smith     while (nz-- > 0) {           /* off-diagonal elemets */
306*4c1414c8SBarry Smith       /* ia[i1+1]++; */
307*4c1414c8SBarry Smith       ia[i2+1]++;
308*4c1414c8SBarry Smith       i2++;
309*4c1414c8SBarry Smith       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
310*4c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
311*4c1414c8SBarry Smith     }
312*4c1414c8SBarry Smith   }
313*4c1414c8SBarry Smith 
314*4c1414c8SBarry Smith   /* shift ia[i] to point to next col */
315*4c1414c8SBarry Smith   for (i1=1; i1<nslim_col+1; i1++) {
316*4c1414c8SBarry Smith     col        = ia[i1-1];
317*4c1414c8SBarry Smith     ia[i1]    += col;
318*4c1414c8SBarry Smith     work[i1-1] = col - oshift;
319*4c1414c8SBarry Smith   }
320*4c1414c8SBarry Smith 
321*4c1414c8SBarry Smith   /* allocate space for column pointers */
322*4c1414c8SBarry Smith   nz   = ia[nslim_col] + (!ishift);
323*4c1414c8SBarry Smith   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
324*4c1414c8SBarry Smith   *jja = ja;
325*4c1414c8SBarry Smith 
326*4c1414c8SBarry Smith  /* loop over matrix putting into ja */
327*4c1414c8SBarry Smith   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
328*4c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
329*4c1414c8SBarry Smith     i2  = 0;                     /* Col inode index */
330*4c1414c8SBarry Smith     col = *j++ + ishift;
331*4c1414c8SBarry Smith     i2  = tvc[col];
332*4c1414c8SBarry Smith     nz  = ai[row+1] - ai[row];
333*4c1414c8SBarry Smith     while (nz-- > 0) {
334*4c1414c8SBarry Smith       /* ja[work[i1]++] = i2 + oshift; */
335*4c1414c8SBarry Smith       ja[work[i2]++] = i1 + oshift;
336*4c1414c8SBarry Smith       i2++;
337*4c1414c8SBarry Smith       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
338*4c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
339*4c1414c8SBarry Smith     }
340*4c1414c8SBarry Smith   }
341*4c1414c8SBarry Smith   ierr = PetscFree(ns_col);CHKERRQ(ierr);
342*4c1414c8SBarry Smith   ierr = PetscFree(work);CHKERRQ(ierr);
343*4c1414c8SBarry Smith   ierr = PetscFree(tns);CHKERRQ(ierr);
344*4c1414c8SBarry Smith   ierr = PetscFree(tvc);CHKERRQ(ierr);
345*4c1414c8SBarry Smith   PetscFunctionReturn(0);
346*4c1414c8SBarry Smith }
347*4c1414c8SBarry Smith 
348*4c1414c8SBarry Smith #undef __FUNCT__
349*4c1414c8SBarry Smith #define __FUNCT__ "MatGetColumnIJ_Inode"
350*4c1414c8SBarry Smith static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
351*4c1414c8SBarry Smith {
352*4c1414c8SBarry Smith   PetscErrorCode ierr;
353*4c1414c8SBarry Smith 
354*4c1414c8SBarry Smith   PetscFunctionBegin;
355*4c1414c8SBarry Smith   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
356*4c1414c8SBarry Smith   if (!ia) PetscFunctionReturn(0);
357*4c1414c8SBarry Smith 
358*4c1414c8SBarry Smith   if (symmetric) {
359*4c1414c8SBarry Smith     /* Since the indices are symmetric it does'nt matter */
360*4c1414c8SBarry Smith     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
361*4c1414c8SBarry Smith   } else {
362*4c1414c8SBarry Smith     ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363*4c1414c8SBarry Smith   }
364*4c1414c8SBarry Smith   PetscFunctionReturn(0);
365*4c1414c8SBarry Smith }
366*4c1414c8SBarry Smith 
367*4c1414c8SBarry Smith #undef __FUNCT__
368*4c1414c8SBarry Smith #define __FUNCT__ "MatRestoreColumnIJ_Inode"
369*4c1414c8SBarry Smith static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
370*4c1414c8SBarry Smith {
371*4c1414c8SBarry Smith   PetscErrorCode ierr;
372*4c1414c8SBarry Smith 
373*4c1414c8SBarry Smith   PetscFunctionBegin;
374*4c1414c8SBarry Smith   if (!ia) PetscFunctionReturn(0);
375*4c1414c8SBarry Smith   ierr = PetscFree(*ia);CHKERRQ(ierr);
376*4c1414c8SBarry Smith   ierr = PetscFree(*ja);CHKERRQ(ierr);
377*4c1414c8SBarry Smith   PetscFunctionReturn(0);
378*4c1414c8SBarry Smith }
379*4c1414c8SBarry Smith 
380*4c1414c8SBarry Smith /* ----------------------------------------------------------- */
381*4c1414c8SBarry Smith 
382*4c1414c8SBarry Smith #undef __FUNCT__
383*4c1414c8SBarry Smith #define __FUNCT__ "MatMult_Inode"
384*4c1414c8SBarry Smith static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
385*4c1414c8SBarry Smith {
386*4c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
387*4c1414c8SBarry Smith   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
388*4c1414c8SBarry Smith   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y;
389*4c1414c8SBarry Smith   PetscErrorCode ierr;
390*4c1414c8SBarry Smith   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
391*4c1414c8SBarry Smith 
392*4c1414c8SBarry Smith #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
393*4c1414c8SBarry Smith #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
394*4c1414c8SBarry Smith #endif
395*4c1414c8SBarry Smith 
396*4c1414c8SBarry Smith   PetscFunctionBegin;
397*4c1414c8SBarry Smith   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
398*4c1414c8SBarry Smith   node_max = a->inode.node_count;
399*4c1414c8SBarry Smith   ns       = a->inode.size;     /* Node Size array */
400*4c1414c8SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
401*4c1414c8SBarry Smith   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
402*4c1414c8SBarry Smith   idx  = a->j;
403*4c1414c8SBarry Smith   v1   = a->a;
404*4c1414c8SBarry Smith   ii   = a->i;
405*4c1414c8SBarry Smith 
406*4c1414c8SBarry Smith   for (i = 0,row = 0; i< node_max; ++i){
407*4c1414c8SBarry Smith     nsz  = ns[i];
408*4c1414c8SBarry Smith     n    = ii[1] - ii[0];
409*4c1414c8SBarry Smith     ii  += nsz;
410*4c1414c8SBarry Smith     sz   = n;                   /* No of non zeros in this row */
411*4c1414c8SBarry Smith                                 /* Switch on the size of Node */
412*4c1414c8SBarry Smith     switch (nsz){               /* Each loop in 'case' is unrolled */
413*4c1414c8SBarry Smith     case 1 :
414*4c1414c8SBarry Smith       sum1  = 0;
415*4c1414c8SBarry Smith 
416*4c1414c8SBarry Smith       for(n = 0; n< sz-1; n+=2) {
417*4c1414c8SBarry Smith         i1   = idx[0];          /* The instructions are ordered to */
418*4c1414c8SBarry Smith         i2   = idx[1];          /* make the compiler's job easy */
419*4c1414c8SBarry Smith         idx += 2;
420*4c1414c8SBarry Smith         tmp0 = x[i1];
421*4c1414c8SBarry Smith         tmp1 = x[i2];
422*4c1414c8SBarry Smith         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
423*4c1414c8SBarry Smith        }
424*4c1414c8SBarry Smith 
425*4c1414c8SBarry Smith       if (n == sz-1){          /* Take care of the last nonzero  */
426*4c1414c8SBarry Smith         tmp0  = x[*idx++];
427*4c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
428*4c1414c8SBarry Smith       }
429*4c1414c8SBarry Smith       y[row++]=sum1;
430*4c1414c8SBarry Smith       break;
431*4c1414c8SBarry Smith     case 2:
432*4c1414c8SBarry Smith       sum1  = 0;
433*4c1414c8SBarry Smith       sum2  = 0;
434*4c1414c8SBarry Smith       v2    = v1 + n;
435*4c1414c8SBarry Smith 
436*4c1414c8SBarry Smith       for (n = 0; n< sz-1; n+=2) {
437*4c1414c8SBarry Smith         i1   = idx[0];
438*4c1414c8SBarry Smith         i2   = idx[1];
439*4c1414c8SBarry Smith         idx += 2;
440*4c1414c8SBarry Smith         tmp0 = x[i1];
441*4c1414c8SBarry Smith         tmp1 = x[i2];
442*4c1414c8SBarry Smith         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
443*4c1414c8SBarry Smith         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
444*4c1414c8SBarry Smith       }
445*4c1414c8SBarry Smith       if (n == sz-1){
446*4c1414c8SBarry Smith         tmp0  = x[*idx++];
447*4c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
448*4c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
449*4c1414c8SBarry Smith       }
450*4c1414c8SBarry Smith       y[row++]=sum1;
451*4c1414c8SBarry Smith       y[row++]=sum2;
452*4c1414c8SBarry Smith       v1      =v2;              /* Since the next block to be processed starts there*/
453*4c1414c8SBarry Smith       idx    +=sz;
454*4c1414c8SBarry Smith       break;
455*4c1414c8SBarry Smith     case 3:
456*4c1414c8SBarry Smith       sum1  = 0;
457*4c1414c8SBarry Smith       sum2  = 0;
458*4c1414c8SBarry Smith       sum3  = 0;
459*4c1414c8SBarry Smith       v2    = v1 + n;
460*4c1414c8SBarry Smith       v3    = v2 + n;
461*4c1414c8SBarry Smith 
462*4c1414c8SBarry Smith       for (n = 0; n< sz-1; n+=2) {
463*4c1414c8SBarry Smith         i1   = idx[0];
464*4c1414c8SBarry Smith         i2   = idx[1];
465*4c1414c8SBarry Smith         idx += 2;
466*4c1414c8SBarry Smith         tmp0 = x[i1];
467*4c1414c8SBarry Smith         tmp1 = x[i2];
468*4c1414c8SBarry Smith         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
469*4c1414c8SBarry Smith         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
470*4c1414c8SBarry Smith         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
471*4c1414c8SBarry Smith       }
472*4c1414c8SBarry Smith       if (n == sz-1){
473*4c1414c8SBarry Smith         tmp0  = x[*idx++];
474*4c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
475*4c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
476*4c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
477*4c1414c8SBarry Smith       }
478*4c1414c8SBarry Smith       y[row++]=sum1;
479*4c1414c8SBarry Smith       y[row++]=sum2;
480*4c1414c8SBarry Smith       y[row++]=sum3;
481*4c1414c8SBarry Smith       v1       =v3;             /* Since the next block to be processed starts there*/
482*4c1414c8SBarry Smith       idx     +=2*sz;
483*4c1414c8SBarry Smith       break;
484*4c1414c8SBarry Smith     case 4:
485*4c1414c8SBarry Smith       sum1  = 0;
486*4c1414c8SBarry Smith       sum2  = 0;
487*4c1414c8SBarry Smith       sum3  = 0;
488*4c1414c8SBarry Smith       sum4  = 0;
489*4c1414c8SBarry Smith       v2    = v1 + n;
490*4c1414c8SBarry Smith       v3    = v2 + n;
491*4c1414c8SBarry Smith       v4    = v3 + n;
492*4c1414c8SBarry Smith 
493*4c1414c8SBarry Smith       for (n = 0; n< sz-1; n+=2) {
494*4c1414c8SBarry Smith         i1   = idx[0];
495*4c1414c8SBarry Smith         i2   = idx[1];
496*4c1414c8SBarry Smith         idx += 2;
497*4c1414c8SBarry Smith         tmp0 = x[i1];
498*4c1414c8SBarry Smith         tmp1 = x[i2];
499*4c1414c8SBarry Smith         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
500*4c1414c8SBarry Smith         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
501*4c1414c8SBarry Smith         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
502*4c1414c8SBarry Smith         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
503*4c1414c8SBarry Smith       }
504*4c1414c8SBarry Smith       if (n == sz-1){
505*4c1414c8SBarry Smith         tmp0  = x[*idx++];
506*4c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
507*4c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
508*4c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
509*4c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
510*4c1414c8SBarry Smith       }
511*4c1414c8SBarry Smith       y[row++]=sum1;
512*4c1414c8SBarry Smith       y[row++]=sum2;
513*4c1414c8SBarry Smith       y[row++]=sum3;
514*4c1414c8SBarry Smith       y[row++]=sum4;
515*4c1414c8SBarry Smith       v1      =v4;              /* Since the next block to be processed starts there*/
516*4c1414c8SBarry Smith       idx    +=3*sz;
517*4c1414c8SBarry Smith       break;
518*4c1414c8SBarry Smith     case 5:
519*4c1414c8SBarry Smith       sum1  = 0;
520*4c1414c8SBarry Smith       sum2  = 0;
521*4c1414c8SBarry Smith       sum3  = 0;
522*4c1414c8SBarry Smith       sum4  = 0;
523*4c1414c8SBarry Smith       sum5  = 0;
524*4c1414c8SBarry Smith       v2    = v1 + n;
525*4c1414c8SBarry Smith       v3    = v2 + n;
526*4c1414c8SBarry Smith       v4    = v3 + n;
527*4c1414c8SBarry Smith       v5    = v4 + n;
528*4c1414c8SBarry Smith 
529*4c1414c8SBarry Smith       for (n = 0; n<sz-1; n+=2) {
530*4c1414c8SBarry Smith         i1   = idx[0];
531*4c1414c8SBarry Smith         i2   = idx[1];
532*4c1414c8SBarry Smith         idx += 2;
533*4c1414c8SBarry Smith         tmp0 = x[i1];
534*4c1414c8SBarry Smith         tmp1 = x[i2];
535*4c1414c8SBarry Smith         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
536*4c1414c8SBarry Smith         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
537*4c1414c8SBarry Smith         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
538*4c1414c8SBarry Smith         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
539*4c1414c8SBarry Smith         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
540*4c1414c8SBarry Smith       }
541*4c1414c8SBarry Smith       if (n == sz-1){
542*4c1414c8SBarry Smith         tmp0  = x[*idx++];
543*4c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
544*4c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
545*4c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
546*4c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
547*4c1414c8SBarry Smith         sum5 += *v5++ * tmp0;
548*4c1414c8SBarry Smith       }
549*4c1414c8SBarry Smith       y[row++]=sum1;
550*4c1414c8SBarry Smith       y[row++]=sum2;
551*4c1414c8SBarry Smith       y[row++]=sum3;
552*4c1414c8SBarry Smith       y[row++]=sum4;
553*4c1414c8SBarry Smith       y[row++]=sum5;
554*4c1414c8SBarry Smith       v1      =v5;       /* Since the next block to be processed starts there */
555*4c1414c8SBarry Smith       idx    +=4*sz;
556*4c1414c8SBarry Smith       break;
557*4c1414c8SBarry Smith     default :
558*4c1414c8SBarry Smith       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
559*4c1414c8SBarry Smith     }
560*4c1414c8SBarry Smith   }
561*4c1414c8SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
562*4c1414c8SBarry Smith   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
563*4c1414c8SBarry Smith   ierr = PetscLogFlops(2*a->nz - A->rmap.n);CHKERRQ(ierr);
564*4c1414c8SBarry Smith   PetscFunctionReturn(0);
565*4c1414c8SBarry Smith }
566*4c1414c8SBarry Smith /* ----------------------------------------------------------- */
567*4c1414c8SBarry Smith /* Almost same code as the MatMult_Inode() */
568*4c1414c8SBarry Smith #undef __FUNCT__
569*4c1414c8SBarry Smith #define __FUNCT__ "MatMultAdd_Inode"
570*4c1414c8SBarry Smith static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
571*4c1414c8SBarry Smith {
572*4c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
573*4c1414c8SBarry Smith   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
574*4c1414c8SBarry Smith   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
575*4c1414c8SBarry Smith   PetscErrorCode ierr;
576*4c1414c8SBarry Smith   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
577*4c1414c8SBarry Smith 
578*4c1414c8SBarry Smith   PetscFunctionBegin;
579*4c1414c8SBarry Smith   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
580*4c1414c8SBarry Smith   node_max = a->inode.node_count;
581*4c1414c8SBarry Smith   ns       = a->inode.size;     /* Node Size array */
582*4c1414c8SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
583*4c1414c8SBarry Smith   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
584*4c1414c8SBarry Smith   if (zz != yy) {
585*4c1414c8SBarry Smith     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
586*4c1414c8SBarry Smith   } else {
587*4c1414c8SBarry Smith     z = y;
588*4c1414c8SBarry Smith   }
589*4c1414c8SBarry Smith   zt = z;
590*4c1414c8SBarry Smith 
591*4c1414c8SBarry Smith   idx  = a->j;
592*4c1414c8SBarry Smith   v1   = a->a;
593*4c1414c8SBarry Smith   ii   = a->i;
594*4c1414c8SBarry Smith 
595*4c1414c8SBarry Smith   for (i = 0,row = 0; i< node_max; ++i){
596*4c1414c8SBarry Smith     nsz  = ns[i];
597*4c1414c8SBarry Smith     n    = ii[1] - ii[0];
598*4c1414c8SBarry Smith     ii  += nsz;
599*4c1414c8SBarry Smith     sz   = n;                   /* No of non zeros in this row */
600*4c1414c8SBarry Smith                                 /* Switch on the size of Node */
601*4c1414c8SBarry Smith     switch (nsz){               /* Each loop in 'case' is unrolled */
602*4c1414c8SBarry Smith     case 1 :
603*4c1414c8SBarry Smith       sum1  = *zt++;
604*4c1414c8SBarry Smith 
605*4c1414c8SBarry Smith       for(n = 0; n< sz-1; n+=2) {
606*4c1414c8SBarry Smith         i1   = idx[0];          /* The instructions are ordered to */
607*4c1414c8SBarry Smith         i2   = idx[1];          /* make the compiler's job easy */
608*4c1414c8SBarry Smith         idx += 2;
609*4c1414c8SBarry Smith         tmp0 = x[i1];
610*4c1414c8SBarry Smith         tmp1 = x[i2];
611*4c1414c8SBarry Smith         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
612*4c1414c8SBarry Smith        }
613*4c1414c8SBarry Smith 
614*4c1414c8SBarry Smith       if(n   == sz-1){          /* Take care of the last nonzero  */
615*4c1414c8SBarry Smith         tmp0  = x[*idx++];
616*4c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
617*4c1414c8SBarry Smith       }
618*4c1414c8SBarry Smith       y[row++]=sum1;
619*4c1414c8SBarry Smith       break;
620*4c1414c8SBarry Smith     case 2:
621*4c1414c8SBarry Smith       sum1  = *zt++;
622*4c1414c8SBarry Smith       sum2  = *zt++;
623*4c1414c8SBarry Smith       v2    = v1 + n;
624*4c1414c8SBarry Smith 
625*4c1414c8SBarry Smith       for(n = 0; n< sz-1; n+=2) {
626*4c1414c8SBarry Smith         i1   = idx[0];
627*4c1414c8SBarry Smith         i2   = idx[1];
628*4c1414c8SBarry Smith         idx += 2;
629*4c1414c8SBarry Smith         tmp0 = x[i1];
630*4c1414c8SBarry Smith         tmp1 = x[i2];
631*4c1414c8SBarry Smith         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
632*4c1414c8SBarry Smith         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
633*4c1414c8SBarry Smith       }
634*4c1414c8SBarry Smith       if(n   == sz-1){
635*4c1414c8SBarry Smith         tmp0  = x[*idx++];
636*4c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
637*4c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
638*4c1414c8SBarry Smith       }
639*4c1414c8SBarry Smith       y[row++]=sum1;
640*4c1414c8SBarry Smith       y[row++]=sum2;
641*4c1414c8SBarry Smith       v1      =v2;              /* Since the next block to be processed starts there*/
642*4c1414c8SBarry Smith       idx    +=sz;
643*4c1414c8SBarry Smith       break;
644*4c1414c8SBarry Smith     case 3:
645*4c1414c8SBarry Smith       sum1  = *zt++;
646*4c1414c8SBarry Smith       sum2  = *zt++;
647*4c1414c8SBarry Smith       sum3  = *zt++;
648*4c1414c8SBarry Smith       v2    = v1 + n;
649*4c1414c8SBarry Smith       v3    = v2 + n;
650*4c1414c8SBarry Smith 
651*4c1414c8SBarry Smith       for (n = 0; n< sz-1; n+=2) {
652*4c1414c8SBarry Smith         i1   = idx[0];
653*4c1414c8SBarry Smith         i2   = idx[1];
654*4c1414c8SBarry Smith         idx += 2;
655*4c1414c8SBarry Smith         tmp0 = x[i1];
656*4c1414c8SBarry Smith         tmp1 = x[i2];
657*4c1414c8SBarry Smith         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
658*4c1414c8SBarry Smith         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
659*4c1414c8SBarry Smith         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
660*4c1414c8SBarry Smith       }
661*4c1414c8SBarry Smith       if (n == sz-1){
662*4c1414c8SBarry Smith         tmp0  = x[*idx++];
663*4c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
664*4c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
665*4c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
666*4c1414c8SBarry Smith       }
667*4c1414c8SBarry Smith       y[row++]=sum1;
668*4c1414c8SBarry Smith       y[row++]=sum2;
669*4c1414c8SBarry Smith       y[row++]=sum3;
670*4c1414c8SBarry Smith       v1       =v3;             /* Since the next block to be processed starts there*/
671*4c1414c8SBarry Smith       idx     +=2*sz;
672*4c1414c8SBarry Smith       break;
673*4c1414c8SBarry Smith     case 4:
674*4c1414c8SBarry Smith       sum1  = *zt++;
675*4c1414c8SBarry Smith       sum2  = *zt++;
676*4c1414c8SBarry Smith       sum3  = *zt++;
677*4c1414c8SBarry Smith       sum4  = *zt++;
678*4c1414c8SBarry Smith       v2    = v1 + n;
679*4c1414c8SBarry Smith       v3    = v2 + n;
680*4c1414c8SBarry Smith       v4    = v3 + n;
681*4c1414c8SBarry Smith 
682*4c1414c8SBarry Smith       for (n = 0; n< sz-1; n+=2) {
683*4c1414c8SBarry Smith         i1   = idx[0];
684*4c1414c8SBarry Smith         i2   = idx[1];
685*4c1414c8SBarry Smith         idx += 2;
686*4c1414c8SBarry Smith         tmp0 = x[i1];
687*4c1414c8SBarry Smith         tmp1 = x[i2];
688*4c1414c8SBarry Smith         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
689*4c1414c8SBarry Smith         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
690*4c1414c8SBarry Smith         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
691*4c1414c8SBarry Smith         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
692*4c1414c8SBarry Smith       }
693*4c1414c8SBarry Smith       if (n == sz-1){
694*4c1414c8SBarry Smith         tmp0  = x[*idx++];
695*4c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
696*4c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
697*4c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
698*4c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
699*4c1414c8SBarry Smith       }
700*4c1414c8SBarry Smith       y[row++]=sum1;
701*4c1414c8SBarry Smith       y[row++]=sum2;
702*4c1414c8SBarry Smith       y[row++]=sum3;
703*4c1414c8SBarry Smith       y[row++]=sum4;
704*4c1414c8SBarry Smith       v1      =v4;              /* Since the next block to be processed starts there*/
705*4c1414c8SBarry Smith       idx    +=3*sz;
706*4c1414c8SBarry Smith       break;
707*4c1414c8SBarry Smith     case 5:
708*4c1414c8SBarry Smith       sum1  = *zt++;
709*4c1414c8SBarry Smith       sum2  = *zt++;
710*4c1414c8SBarry Smith       sum3  = *zt++;
711*4c1414c8SBarry Smith       sum4  = *zt++;
712*4c1414c8SBarry Smith       sum5  = *zt++;
713*4c1414c8SBarry Smith       v2    = v1 + n;
714*4c1414c8SBarry Smith       v3    = v2 + n;
715*4c1414c8SBarry Smith       v4    = v3 + n;
716*4c1414c8SBarry Smith       v5    = v4 + n;
717*4c1414c8SBarry Smith 
718*4c1414c8SBarry Smith       for (n = 0; n<sz-1; n+=2) {
719*4c1414c8SBarry Smith         i1   = idx[0];
720*4c1414c8SBarry Smith         i2   = idx[1];
721*4c1414c8SBarry Smith         idx += 2;
722*4c1414c8SBarry Smith         tmp0 = x[i1];
723*4c1414c8SBarry Smith         tmp1 = x[i2];
724*4c1414c8SBarry Smith         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
725*4c1414c8SBarry Smith         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
726*4c1414c8SBarry Smith         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
727*4c1414c8SBarry Smith         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
728*4c1414c8SBarry Smith         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
729*4c1414c8SBarry Smith       }
730*4c1414c8SBarry Smith       if(n   == sz-1){
731*4c1414c8SBarry Smith         tmp0  = x[*idx++];
732*4c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
733*4c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
734*4c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
735*4c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
736*4c1414c8SBarry Smith         sum5 += *v5++ * tmp0;
737*4c1414c8SBarry Smith       }
738*4c1414c8SBarry Smith       y[row++]=sum1;
739*4c1414c8SBarry Smith       y[row++]=sum2;
740*4c1414c8SBarry Smith       y[row++]=sum3;
741*4c1414c8SBarry Smith       y[row++]=sum4;
742*4c1414c8SBarry Smith       y[row++]=sum5;
743*4c1414c8SBarry Smith       v1      =v5;       /* Since the next block to be processed starts there */
744*4c1414c8SBarry Smith       idx    +=4*sz;
745*4c1414c8SBarry Smith       break;
746*4c1414c8SBarry Smith     default :
747*4c1414c8SBarry Smith       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
748*4c1414c8SBarry Smith     }
749*4c1414c8SBarry Smith   }
750*4c1414c8SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
751*4c1414c8SBarry Smith   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
752*4c1414c8SBarry Smith   if (zz != yy) {
753*4c1414c8SBarry Smith     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
754*4c1414c8SBarry Smith   }
755*4c1414c8SBarry Smith   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
756*4c1414c8SBarry Smith   PetscFunctionReturn(0);
757*4c1414c8SBarry Smith }
758*4c1414c8SBarry Smith 
759*4c1414c8SBarry Smith /* ----------------------------------------------------------- */
760*4c1414c8SBarry Smith #undef __FUNCT__
761*4c1414c8SBarry Smith #define __FUNCT__ "MatSolve_Inode"
762*4c1414c8SBarry Smith PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
763*4c1414c8SBarry Smith {
764*4c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
765*4c1414c8SBarry Smith   IS             iscol = a->col,isrow = a->row;
766*4c1414c8SBarry Smith   PetscErrorCode ierr;
767*4c1414c8SBarry Smith   PetscInt       *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
768*4c1414c8SBarry Smith   PetscInt       node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
769*4c1414c8SBarry Smith   PetscScalar    *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
770*4c1414c8SBarry Smith   PetscScalar    sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
771*4c1414c8SBarry Smith 
772*4c1414c8SBarry Smith   PetscFunctionBegin;
773*4c1414c8SBarry Smith   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
774*4c1414c8SBarry Smith   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
775*4c1414c8SBarry Smith   node_max = a->inode.node_count;
776*4c1414c8SBarry Smith   ns       = a->inode.size;     /* Node Size array */
777*4c1414c8SBarry Smith 
778*4c1414c8SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
779*4c1414c8SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
780*4c1414c8SBarry Smith   tmp  = a->solve_work;
781*4c1414c8SBarry Smith 
782*4c1414c8SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
783*4c1414c8SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
784*4c1414c8SBarry Smith 
785*4c1414c8SBarry Smith   /* forward solve the lower triangular */
786*4c1414c8SBarry Smith   tmps = tmp ;
787*4c1414c8SBarry Smith   aa   = a_a ;
788*4c1414c8SBarry Smith   aj   = a_j ;
789*4c1414c8SBarry Smith   ad   = a->diag;
790*4c1414c8SBarry Smith 
791*4c1414c8SBarry Smith   for (i = 0,row = 0; i< node_max; ++i){
792*4c1414c8SBarry Smith     nsz = ns[i];
793*4c1414c8SBarry Smith     aii = ai[row];
794*4c1414c8SBarry Smith     v1  = aa + aii;
795*4c1414c8SBarry Smith     vi  = aj + aii;
796*4c1414c8SBarry Smith     nz  = ad[row]- aii;
797*4c1414c8SBarry Smith 
798*4c1414c8SBarry Smith     switch (nsz){               /* Each loop in 'case' is unrolled */
799*4c1414c8SBarry Smith     case 1 :
800*4c1414c8SBarry Smith       sum1 = b[*r++];
801*4c1414c8SBarry Smith       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
802*4c1414c8SBarry Smith       for(j=0; j<nz-1; j+=2){
803*4c1414c8SBarry Smith         i0   = vi[0];
804*4c1414c8SBarry Smith         i1   = vi[1];
805*4c1414c8SBarry Smith         vi  +=2;
806*4c1414c8SBarry Smith         tmp0 = tmps[i0];
807*4c1414c8SBarry Smith         tmp1 = tmps[i1];
808*4c1414c8SBarry Smith         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
809*4c1414c8SBarry Smith       }
810*4c1414c8SBarry Smith       if(j == nz-1){
811*4c1414c8SBarry Smith         tmp0 = tmps[*vi++];
812*4c1414c8SBarry Smith         sum1 -= *v1++ *tmp0;
813*4c1414c8SBarry Smith       }
814*4c1414c8SBarry Smith       tmp[row ++]=sum1;
815*4c1414c8SBarry Smith       break;
816*4c1414c8SBarry Smith     case 2:
817*4c1414c8SBarry Smith       sum1 = b[*r++];
818*4c1414c8SBarry Smith       sum2 = b[*r++];
819*4c1414c8SBarry Smith       v2   = aa + ai[row+1];
820*4c1414c8SBarry Smith 
821*4c1414c8SBarry Smith       for(j=0; j<nz-1; j+=2){
822*4c1414c8SBarry Smith         i0   = vi[0];
823*4c1414c8SBarry Smith         i1   = vi[1];
824*4c1414c8SBarry Smith         vi  +=2;
825*4c1414c8SBarry Smith         tmp0 = tmps[i0];
826*4c1414c8SBarry Smith         tmp1 = tmps[i1];
827*4c1414c8SBarry Smith         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828*4c1414c8SBarry Smith         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
829*4c1414c8SBarry Smith       }
830*4c1414c8SBarry Smith       if(j == nz-1){
831*4c1414c8SBarry Smith         tmp0 = tmps[*vi++];
832*4c1414c8SBarry Smith         sum1 -= *v1++ *tmp0;
833*4c1414c8SBarry Smith         sum2 -= *v2++ *tmp0;
834*4c1414c8SBarry Smith       }
835*4c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
836*4c1414c8SBarry Smith       tmp[row ++]=sum1;
837*4c1414c8SBarry Smith       tmp[row ++]=sum2;
838*4c1414c8SBarry Smith       break;
839*4c1414c8SBarry Smith     case 3:
840*4c1414c8SBarry Smith       sum1 = b[*r++];
841*4c1414c8SBarry Smith       sum2 = b[*r++];
842*4c1414c8SBarry Smith       sum3 = b[*r++];
843*4c1414c8SBarry Smith       v2   = aa + ai[row+1];
844*4c1414c8SBarry Smith       v3   = aa + ai[row+2];
845*4c1414c8SBarry Smith 
846*4c1414c8SBarry Smith       for (j=0; j<nz-1; j+=2){
847*4c1414c8SBarry Smith         i0   = vi[0];
848*4c1414c8SBarry Smith         i1   = vi[1];
849*4c1414c8SBarry Smith         vi  +=2;
850*4c1414c8SBarry Smith         tmp0 = tmps[i0];
851*4c1414c8SBarry Smith         tmp1 = tmps[i1];
852*4c1414c8SBarry Smith         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
853*4c1414c8SBarry Smith         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
854*4c1414c8SBarry Smith         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
855*4c1414c8SBarry Smith       }
856*4c1414c8SBarry Smith       if (j == nz-1){
857*4c1414c8SBarry Smith         tmp0 = tmps[*vi++];
858*4c1414c8SBarry Smith         sum1 -= *v1++ *tmp0;
859*4c1414c8SBarry Smith         sum2 -= *v2++ *tmp0;
860*4c1414c8SBarry Smith         sum3 -= *v3++ *tmp0;
861*4c1414c8SBarry Smith       }
862*4c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
863*4c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
864*4c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
865*4c1414c8SBarry Smith       tmp[row ++]=sum1;
866*4c1414c8SBarry Smith       tmp[row ++]=sum2;
867*4c1414c8SBarry Smith       tmp[row ++]=sum3;
868*4c1414c8SBarry Smith       break;
869*4c1414c8SBarry Smith 
870*4c1414c8SBarry Smith     case 4:
871*4c1414c8SBarry Smith       sum1 = b[*r++];
872*4c1414c8SBarry Smith       sum2 = b[*r++];
873*4c1414c8SBarry Smith       sum3 = b[*r++];
874*4c1414c8SBarry Smith       sum4 = b[*r++];
875*4c1414c8SBarry Smith       v2   = aa + ai[row+1];
876*4c1414c8SBarry Smith       v3   = aa + ai[row+2];
877*4c1414c8SBarry Smith       v4   = aa + ai[row+3];
878*4c1414c8SBarry Smith 
879*4c1414c8SBarry Smith       for (j=0; j<nz-1; j+=2){
880*4c1414c8SBarry Smith         i0   = vi[0];
881*4c1414c8SBarry Smith         i1   = vi[1];
882*4c1414c8SBarry Smith         vi  +=2;
883*4c1414c8SBarry Smith         tmp0 = tmps[i0];
884*4c1414c8SBarry Smith         tmp1 = tmps[i1];
885*4c1414c8SBarry Smith         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
886*4c1414c8SBarry Smith         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
887*4c1414c8SBarry Smith         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
888*4c1414c8SBarry Smith         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
889*4c1414c8SBarry Smith       }
890*4c1414c8SBarry Smith       if (j == nz-1){
891*4c1414c8SBarry Smith         tmp0 = tmps[*vi++];
892*4c1414c8SBarry Smith         sum1 -= *v1++ *tmp0;
893*4c1414c8SBarry Smith         sum2 -= *v2++ *tmp0;
894*4c1414c8SBarry Smith         sum3 -= *v3++ *tmp0;
895*4c1414c8SBarry Smith         sum4 -= *v4++ *tmp0;
896*4c1414c8SBarry Smith       }
897*4c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
898*4c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
899*4c1414c8SBarry Smith       sum4 -= *v4++ * sum1;
900*4c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
901*4c1414c8SBarry Smith       sum4 -= *v4++ * sum2;
902*4c1414c8SBarry Smith       sum4 -= *v4++ * sum3;
903*4c1414c8SBarry Smith 
904*4c1414c8SBarry Smith       tmp[row ++]=sum1;
905*4c1414c8SBarry Smith       tmp[row ++]=sum2;
906*4c1414c8SBarry Smith       tmp[row ++]=sum3;
907*4c1414c8SBarry Smith       tmp[row ++]=sum4;
908*4c1414c8SBarry Smith       break;
909*4c1414c8SBarry Smith     case 5:
910*4c1414c8SBarry Smith       sum1 = b[*r++];
911*4c1414c8SBarry Smith       sum2 = b[*r++];
912*4c1414c8SBarry Smith       sum3 = b[*r++];
913*4c1414c8SBarry Smith       sum4 = b[*r++];
914*4c1414c8SBarry Smith       sum5 = b[*r++];
915*4c1414c8SBarry Smith       v2   = aa + ai[row+1];
916*4c1414c8SBarry Smith       v3   = aa + ai[row+2];
917*4c1414c8SBarry Smith       v4   = aa + ai[row+3];
918*4c1414c8SBarry Smith       v5   = aa + ai[row+4];
919*4c1414c8SBarry Smith 
920*4c1414c8SBarry Smith       for (j=0; j<nz-1; j+=2){
921*4c1414c8SBarry Smith         i0   = vi[0];
922*4c1414c8SBarry Smith         i1   = vi[1];
923*4c1414c8SBarry Smith         vi  +=2;
924*4c1414c8SBarry Smith         tmp0 = tmps[i0];
925*4c1414c8SBarry Smith         tmp1 = tmps[i1];
926*4c1414c8SBarry Smith         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
927*4c1414c8SBarry Smith         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
928*4c1414c8SBarry Smith         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
929*4c1414c8SBarry Smith         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
930*4c1414c8SBarry Smith         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
931*4c1414c8SBarry Smith       }
932*4c1414c8SBarry Smith       if (j == nz-1){
933*4c1414c8SBarry Smith         tmp0 = tmps[*vi++];
934*4c1414c8SBarry Smith         sum1 -= *v1++ *tmp0;
935*4c1414c8SBarry Smith         sum2 -= *v2++ *tmp0;
936*4c1414c8SBarry Smith         sum3 -= *v3++ *tmp0;
937*4c1414c8SBarry Smith         sum4 -= *v4++ *tmp0;
938*4c1414c8SBarry Smith         sum5 -= *v5++ *tmp0;
939*4c1414c8SBarry Smith       }
940*4c1414c8SBarry Smith 
941*4c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
942*4c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
943*4c1414c8SBarry Smith       sum4 -= *v4++ * sum1;
944*4c1414c8SBarry Smith       sum5 -= *v5++ * sum1;
945*4c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
946*4c1414c8SBarry Smith       sum4 -= *v4++ * sum2;
947*4c1414c8SBarry Smith       sum5 -= *v5++ * sum2;
948*4c1414c8SBarry Smith       sum4 -= *v4++ * sum3;
949*4c1414c8SBarry Smith       sum5 -= *v5++ * sum3;
950*4c1414c8SBarry Smith       sum5 -= *v5++ * sum4;
951*4c1414c8SBarry Smith 
952*4c1414c8SBarry Smith       tmp[row ++]=sum1;
953*4c1414c8SBarry Smith       tmp[row ++]=sum2;
954*4c1414c8SBarry Smith       tmp[row ++]=sum3;
955*4c1414c8SBarry Smith       tmp[row ++]=sum4;
956*4c1414c8SBarry Smith       tmp[row ++]=sum5;
957*4c1414c8SBarry Smith       break;
958*4c1414c8SBarry Smith     default:
959*4c1414c8SBarry Smith       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
960*4c1414c8SBarry Smith     }
961*4c1414c8SBarry Smith   }
962*4c1414c8SBarry Smith   /* backward solve the upper triangular */
963*4c1414c8SBarry Smith   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
964*4c1414c8SBarry Smith     nsz = ns[i];
965*4c1414c8SBarry Smith     aii = ai[row+1] -1;
966*4c1414c8SBarry Smith     v1  = aa + aii;
967*4c1414c8SBarry Smith     vi  = aj + aii;
968*4c1414c8SBarry Smith     nz  = aii- ad[row];
969*4c1414c8SBarry Smith     switch (nsz){               /* Each loop in 'case' is unrolled */
970*4c1414c8SBarry Smith     case 1 :
971*4c1414c8SBarry Smith       sum1 = tmp[row];
972*4c1414c8SBarry Smith 
973*4c1414c8SBarry Smith       for(j=nz ; j>1; j-=2){
974*4c1414c8SBarry Smith         vi  -=2;
975*4c1414c8SBarry Smith         i0   = vi[2];
976*4c1414c8SBarry Smith         i1   = vi[1];
977*4c1414c8SBarry Smith         tmp0 = tmps[i0];
978*4c1414c8SBarry Smith         tmp1 = tmps[i1];
979*4c1414c8SBarry Smith         v1   -= 2;
980*4c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
981*4c1414c8SBarry Smith       }
982*4c1414c8SBarry Smith       if (j==1){
983*4c1414c8SBarry Smith         tmp0  = tmps[*vi--];
984*4c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
985*4c1414c8SBarry Smith       }
986*4c1414c8SBarry Smith       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
987*4c1414c8SBarry Smith       break;
988*4c1414c8SBarry Smith     case 2 :
989*4c1414c8SBarry Smith       sum1 = tmp[row];
990*4c1414c8SBarry Smith       sum2 = tmp[row -1];
991*4c1414c8SBarry Smith       v2   = aa + ai[row]-1;
992*4c1414c8SBarry Smith       for (j=nz ; j>1; j-=2){
993*4c1414c8SBarry Smith         vi  -=2;
994*4c1414c8SBarry Smith         i0   = vi[2];
995*4c1414c8SBarry Smith         i1   = vi[1];
996*4c1414c8SBarry Smith         tmp0 = tmps[i0];
997*4c1414c8SBarry Smith         tmp1 = tmps[i1];
998*4c1414c8SBarry Smith         v1   -= 2;
999*4c1414c8SBarry Smith         v2   -= 2;
1000*4c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1001*4c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1002*4c1414c8SBarry Smith       }
1003*4c1414c8SBarry Smith       if (j==1){
1004*4c1414c8SBarry Smith         tmp0  = tmps[*vi--];
1005*4c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
1006*4c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
1007*4c1414c8SBarry Smith       }
1008*4c1414c8SBarry Smith 
1009*4c1414c8SBarry Smith       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1010*4c1414c8SBarry Smith       sum2   -= *v2-- * tmp0;
1011*4c1414c8SBarry Smith       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1012*4c1414c8SBarry Smith       break;
1013*4c1414c8SBarry Smith     case 3 :
1014*4c1414c8SBarry Smith       sum1 = tmp[row];
1015*4c1414c8SBarry Smith       sum2 = tmp[row -1];
1016*4c1414c8SBarry Smith       sum3 = tmp[row -2];
1017*4c1414c8SBarry Smith       v2   = aa + ai[row]-1;
1018*4c1414c8SBarry Smith       v3   = aa + ai[row -1]-1;
1019*4c1414c8SBarry Smith       for (j=nz ; j>1; j-=2){
1020*4c1414c8SBarry Smith         vi  -=2;
1021*4c1414c8SBarry Smith         i0   = vi[2];
1022*4c1414c8SBarry Smith         i1   = vi[1];
1023*4c1414c8SBarry Smith         tmp0 = tmps[i0];
1024*4c1414c8SBarry Smith         tmp1 = tmps[i1];
1025*4c1414c8SBarry Smith         v1   -= 2;
1026*4c1414c8SBarry Smith         v2   -= 2;
1027*4c1414c8SBarry Smith         v3   -= 2;
1028*4c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029*4c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030*4c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1031*4c1414c8SBarry Smith       }
1032*4c1414c8SBarry Smith       if (j==1){
1033*4c1414c8SBarry Smith         tmp0  = tmps[*vi--];
1034*4c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
1035*4c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
1036*4c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
1037*4c1414c8SBarry Smith       }
1038*4c1414c8SBarry Smith       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1039*4c1414c8SBarry Smith       sum2   -= *v2-- * tmp0;
1040*4c1414c8SBarry Smith       sum3   -= *v3-- * tmp0;
1041*4c1414c8SBarry Smith       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1042*4c1414c8SBarry Smith       sum3   -= *v3-- * tmp0;
1043*4c1414c8SBarry Smith       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1044*4c1414c8SBarry Smith 
1045*4c1414c8SBarry Smith       break;
1046*4c1414c8SBarry Smith     case 4 :
1047*4c1414c8SBarry Smith       sum1 = tmp[row];
1048*4c1414c8SBarry Smith       sum2 = tmp[row -1];
1049*4c1414c8SBarry Smith       sum3 = tmp[row -2];
1050*4c1414c8SBarry Smith       sum4 = tmp[row -3];
1051*4c1414c8SBarry Smith       v2   = aa + ai[row]-1;
1052*4c1414c8SBarry Smith       v3   = aa + ai[row -1]-1;
1053*4c1414c8SBarry Smith       v4   = aa + ai[row -2]-1;
1054*4c1414c8SBarry Smith 
1055*4c1414c8SBarry Smith       for (j=nz ; j>1; j-=2){
1056*4c1414c8SBarry Smith         vi  -=2;
1057*4c1414c8SBarry Smith         i0   = vi[2];
1058*4c1414c8SBarry Smith         i1   = vi[1];
1059*4c1414c8SBarry Smith         tmp0 = tmps[i0];
1060*4c1414c8SBarry Smith         tmp1 = tmps[i1];
1061*4c1414c8SBarry Smith         v1  -= 2;
1062*4c1414c8SBarry Smith         v2  -= 2;
1063*4c1414c8SBarry Smith         v3  -= 2;
1064*4c1414c8SBarry Smith         v4  -= 2;
1065*4c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1066*4c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1067*4c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1068*4c1414c8SBarry Smith         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1069*4c1414c8SBarry Smith       }
1070*4c1414c8SBarry Smith       if (j==1){
1071*4c1414c8SBarry Smith         tmp0  = tmps[*vi--];
1072*4c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
1073*4c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
1074*4c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
1075*4c1414c8SBarry Smith         sum4 -= *v4-- * tmp0;
1076*4c1414c8SBarry Smith       }
1077*4c1414c8SBarry Smith 
1078*4c1414c8SBarry Smith       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1079*4c1414c8SBarry Smith       sum2   -= *v2-- * tmp0;
1080*4c1414c8SBarry Smith       sum3   -= *v3-- * tmp0;
1081*4c1414c8SBarry Smith       sum4   -= *v4-- * tmp0;
1082*4c1414c8SBarry Smith       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1083*4c1414c8SBarry Smith       sum3   -= *v3-- * tmp0;
1084*4c1414c8SBarry Smith       sum4   -= *v4-- * tmp0;
1085*4c1414c8SBarry Smith       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1086*4c1414c8SBarry Smith       sum4   -= *v4-- * tmp0;
1087*4c1414c8SBarry Smith       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1088*4c1414c8SBarry Smith       break;
1089*4c1414c8SBarry Smith     case 5 :
1090*4c1414c8SBarry Smith       sum1 = tmp[row];
1091*4c1414c8SBarry Smith       sum2 = tmp[row -1];
1092*4c1414c8SBarry Smith       sum3 = tmp[row -2];
1093*4c1414c8SBarry Smith       sum4 = tmp[row -3];
1094*4c1414c8SBarry Smith       sum5 = tmp[row -4];
1095*4c1414c8SBarry Smith       v2   = aa + ai[row]-1;
1096*4c1414c8SBarry Smith       v3   = aa + ai[row -1]-1;
1097*4c1414c8SBarry Smith       v4   = aa + ai[row -2]-1;
1098*4c1414c8SBarry Smith       v5   = aa + ai[row -3]-1;
1099*4c1414c8SBarry Smith       for (j=nz ; j>1; j-=2){
1100*4c1414c8SBarry Smith         vi  -= 2;
1101*4c1414c8SBarry Smith         i0   = vi[2];
1102*4c1414c8SBarry Smith         i1   = vi[1];
1103*4c1414c8SBarry Smith         tmp0 = tmps[i0];
1104*4c1414c8SBarry Smith         tmp1 = tmps[i1];
1105*4c1414c8SBarry Smith         v1   -= 2;
1106*4c1414c8SBarry Smith         v2   -= 2;
1107*4c1414c8SBarry Smith         v3   -= 2;
1108*4c1414c8SBarry Smith         v4   -= 2;
1109*4c1414c8SBarry Smith         v5   -= 2;
1110*4c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1111*4c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1112*4c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1113*4c1414c8SBarry Smith         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1114*4c1414c8SBarry Smith         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1115*4c1414c8SBarry Smith       }
1116*4c1414c8SBarry Smith       if (j==1){
1117*4c1414c8SBarry Smith         tmp0  = tmps[*vi--];
1118*4c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
1119*4c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
1120*4c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
1121*4c1414c8SBarry Smith         sum4 -= *v4-- * tmp0;
1122*4c1414c8SBarry Smith         sum5 -= *v5-- * tmp0;
1123*4c1414c8SBarry Smith       }
1124*4c1414c8SBarry Smith 
1125*4c1414c8SBarry Smith       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1126*4c1414c8SBarry Smith       sum2   -= *v2-- * tmp0;
1127*4c1414c8SBarry Smith       sum3   -= *v3-- * tmp0;
1128*4c1414c8SBarry Smith       sum4   -= *v4-- * tmp0;
1129*4c1414c8SBarry Smith       sum5   -= *v5-- * tmp0;
1130*4c1414c8SBarry Smith       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1131*4c1414c8SBarry Smith       sum3   -= *v3-- * tmp0;
1132*4c1414c8SBarry Smith       sum4   -= *v4-- * tmp0;
1133*4c1414c8SBarry Smith       sum5   -= *v5-- * tmp0;
1134*4c1414c8SBarry Smith       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1135*4c1414c8SBarry Smith       sum4   -= *v4-- * tmp0;
1136*4c1414c8SBarry Smith       sum5   -= *v5-- * tmp0;
1137*4c1414c8SBarry Smith       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1138*4c1414c8SBarry Smith       sum5   -= *v5-- * tmp0;
1139*4c1414c8SBarry Smith       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1140*4c1414c8SBarry Smith       break;
1141*4c1414c8SBarry Smith     default:
1142*4c1414c8SBarry Smith       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1143*4c1414c8SBarry Smith     }
1144*4c1414c8SBarry Smith   }
1145*4c1414c8SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1146*4c1414c8SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1147*4c1414c8SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1148*4c1414c8SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1149*4c1414c8SBarry Smith   ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr);
1150*4c1414c8SBarry Smith   PetscFunctionReturn(0);
1151*4c1414c8SBarry Smith }
1152*4c1414c8SBarry Smith 
1153*4c1414c8SBarry Smith #undef __FUNCT__
1154*4c1414c8SBarry Smith #define __FUNCT__ "MatLUFactorNumeric_Inode"
1155*4c1414c8SBarry Smith PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1156*4c1414c8SBarry Smith {
1157*4c1414c8SBarry Smith   Mat            C = *B;
1158*4c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1159*4c1414c8SBarry Smith   IS             iscol = b->col,isrow = b->row,isicol = b->icol;
1160*4c1414c8SBarry Smith   PetscErrorCode ierr;
1161*4c1414c8SBarry Smith   PetscInt       *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1162*4c1414c8SBarry Smith   PetscInt       *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1163*4c1414c8SBarry Smith   PetscInt       *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1164*4c1414c8SBarry Smith   PetscInt       *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1165*4c1414c8SBarry Smith   PetscScalar    *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1166*4c1414c8SBarry Smith   PetscScalar    tmp,*ba = b->a,*aa = a->a,*pv;
1167*4c1414c8SBarry Smith   PetscReal      rs=0.0;
1168*4c1414c8SBarry Smith   LUShift_Ctx    sctx;
1169*4c1414c8SBarry Smith   PetscInt       newshift;
1170*4c1414c8SBarry Smith 
1171*4c1414c8SBarry Smith   PetscFunctionBegin;
1172*4c1414c8SBarry Smith   sctx.shift_top  = 0;
1173*4c1414c8SBarry Smith   sctx.nshift_max = 0;
1174*4c1414c8SBarry Smith   sctx.shift_lo   = 0;
1175*4c1414c8SBarry Smith   sctx.shift_hi   = 0;
1176*4c1414c8SBarry Smith 
1177*4c1414c8SBarry Smith   /* if both shift schemes are chosen by user, only use info->shiftpd */
1178*4c1414c8SBarry Smith   if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1179*4c1414c8SBarry Smith   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1180*4c1414c8SBarry Smith     sctx.shift_top = 0;
1181*4c1414c8SBarry Smith     for (i=0; i<n; i++) {
1182*4c1414c8SBarry Smith       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1183*4c1414c8SBarry Smith       rs    = 0.0;
1184*4c1414c8SBarry Smith       ajtmp = aj + ai[i];
1185*4c1414c8SBarry Smith       rtmp1 = aa + ai[i];
1186*4c1414c8SBarry Smith       nz = ai[i+1] - ai[i];
1187*4c1414c8SBarry Smith       for (j=0; j<nz; j++){
1188*4c1414c8SBarry Smith         if (*ajtmp != i){
1189*4c1414c8SBarry Smith           rs += PetscAbsScalar(*rtmp1++);
1190*4c1414c8SBarry Smith         } else {
1191*4c1414c8SBarry Smith           rs -= PetscRealPart(*rtmp1++);
1192*4c1414c8SBarry Smith         }
1193*4c1414c8SBarry Smith         ajtmp++;
1194*4c1414c8SBarry Smith       }
1195*4c1414c8SBarry Smith       if (rs>sctx.shift_top) sctx.shift_top = rs;
1196*4c1414c8SBarry Smith     }
1197*4c1414c8SBarry Smith     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1198*4c1414c8SBarry Smith     sctx.shift_top *= 1.1;
1199*4c1414c8SBarry Smith     sctx.nshift_max = 5;
1200*4c1414c8SBarry Smith     sctx.shift_lo   = 0.;
1201*4c1414c8SBarry Smith     sctx.shift_hi   = 1.;
1202*4c1414c8SBarry Smith   }
1203*4c1414c8SBarry Smith   sctx.shift_amount = 0;
1204*4c1414c8SBarry Smith   sctx.nshift       = 0;
1205*4c1414c8SBarry Smith 
1206*4c1414c8SBarry Smith   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1207*4c1414c8SBarry Smith   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1208*4c1414c8SBarry Smith   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1209*4c1414c8SBarry Smith   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1210*4c1414c8SBarry Smith   ierr  = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1211*4c1414c8SBarry Smith   ics   = ic ;
1212*4c1414c8SBarry Smith   rtmp2 = rtmp1 + n;
1213*4c1414c8SBarry Smith   rtmp3 = rtmp2 + n;
1214*4c1414c8SBarry Smith 
1215*4c1414c8SBarry Smith   node_max = a->inode.node_count;
1216*4c1414c8SBarry Smith   ns       = a->inode.size ;
1217*4c1414c8SBarry Smith   if (!ns){
1218*4c1414c8SBarry Smith     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1219*4c1414c8SBarry Smith   }
1220*4c1414c8SBarry Smith 
1221*4c1414c8SBarry Smith   /* If max inode size > 3, split it into two inodes.*/
1222*4c1414c8SBarry Smith   /* also map the inode sizes according to the ordering */
1223*4c1414c8SBarry Smith   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1224*4c1414c8SBarry Smith   for (i=0,j=0; i<node_max; ++i,++j){
1225*4c1414c8SBarry Smith     if (ns[i]>3) {
1226*4c1414c8SBarry Smith       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1227*4c1414c8SBarry Smith       ++j;
1228*4c1414c8SBarry Smith       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1229*4c1414c8SBarry Smith     } else {
1230*4c1414c8SBarry Smith       tmp_vec1[j] = ns[i];
1231*4c1414c8SBarry Smith     }
1232*4c1414c8SBarry Smith   }
1233*4c1414c8SBarry Smith   /* Use the correct node_max */
1234*4c1414c8SBarry Smith   node_max = j;
1235*4c1414c8SBarry Smith 
1236*4c1414c8SBarry Smith   /* Now reorder the inode info based on mat re-ordering info */
1237*4c1414c8SBarry Smith   /* First create a row -> inode_size_array_index map */
1238*4c1414c8SBarry Smith   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1239*4c1414c8SBarry Smith   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1240*4c1414c8SBarry Smith   for (i=0,row=0; i<node_max; i++) {
1241*4c1414c8SBarry Smith     nodesz = tmp_vec1[i];
1242*4c1414c8SBarry Smith     for (j=0; j<nodesz; j++,row++) {
1243*4c1414c8SBarry Smith       nsmap[row] = i;
1244*4c1414c8SBarry Smith     }
1245*4c1414c8SBarry Smith   }
1246*4c1414c8SBarry Smith   /* Using nsmap, create a reordered ns structure */
1247*4c1414c8SBarry Smith   for (i=0,j=0; i< node_max; i++) {
1248*4c1414c8SBarry Smith     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1249*4c1414c8SBarry Smith     tmp_vec2[i]  = nodesz;
1250*4c1414c8SBarry Smith     j           += nodesz;
1251*4c1414c8SBarry Smith   }
1252*4c1414c8SBarry Smith   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1253*4c1414c8SBarry Smith   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1254*4c1414c8SBarry Smith   /* Now use the correct ns */
1255*4c1414c8SBarry Smith   ns = tmp_vec2;
1256*4c1414c8SBarry Smith 
1257*4c1414c8SBarry Smith   do {
1258*4c1414c8SBarry Smith     sctx.lushift = PETSC_FALSE;
1259*4c1414c8SBarry Smith     /* Now loop over each block-row, and do the factorization */
1260*4c1414c8SBarry Smith     for (i=0,row=0; i<node_max; i++) {
1261*4c1414c8SBarry Smith       nodesz = ns[i];
1262*4c1414c8SBarry Smith       nz     = bi[row+1] - bi[row];
1263*4c1414c8SBarry Smith       bjtmp  = bj + bi[row];
1264*4c1414c8SBarry Smith 
1265*4c1414c8SBarry Smith       switch (nodesz){
1266*4c1414c8SBarry Smith       case 1:
1267*4c1414c8SBarry Smith         for  (j=0; j<nz; j++){
1268*4c1414c8SBarry Smith           idx        = bjtmp[j];
1269*4c1414c8SBarry Smith           rtmp1[idx] = 0.0;
1270*4c1414c8SBarry Smith         }
1271*4c1414c8SBarry Smith 
1272*4c1414c8SBarry Smith         /* load in initial (unfactored row) */
1273*4c1414c8SBarry Smith         idx    = r[row];
1274*4c1414c8SBarry Smith         nz_tmp = ai[idx+1] - ai[idx];
1275*4c1414c8SBarry Smith         ajtmp  = aj + ai[idx];
1276*4c1414c8SBarry Smith         v1     = aa + ai[idx];
1277*4c1414c8SBarry Smith 
1278*4c1414c8SBarry Smith         for (j=0; j<nz_tmp; j++) {
1279*4c1414c8SBarry Smith           idx        = ics[ajtmp[j]];
1280*4c1414c8SBarry Smith           rtmp1[idx] = v1[j];
1281*4c1414c8SBarry Smith         }
1282*4c1414c8SBarry Smith         rtmp1[ics[r[row]]] += sctx.shift_amount;
1283*4c1414c8SBarry Smith 
1284*4c1414c8SBarry Smith         prow = *bjtmp++ ;
1285*4c1414c8SBarry Smith         while (prow < row) {
1286*4c1414c8SBarry Smith           pc1 = rtmp1 + prow;
1287*4c1414c8SBarry Smith           if (*pc1 != 0.0){
1288*4c1414c8SBarry Smith             pv   = ba + bd[prow];
1289*4c1414c8SBarry Smith             pj   = nbj + bd[prow];
1290*4c1414c8SBarry Smith             mul1 = *pc1 * *pv++;
1291*4c1414c8SBarry Smith             *pc1 = mul1;
1292*4c1414c8SBarry Smith             nz_tmp = bi[prow+1] - bd[prow] - 1;
1293*4c1414c8SBarry Smith             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1294*4c1414c8SBarry Smith             for (j=0; j<nz_tmp; j++) {
1295*4c1414c8SBarry Smith               tmp = pv[j];
1296*4c1414c8SBarry Smith               idx = pj[j];
1297*4c1414c8SBarry Smith               rtmp1[idx] -= mul1 * tmp;
1298*4c1414c8SBarry Smith             }
1299*4c1414c8SBarry Smith           }
1300*4c1414c8SBarry Smith           prow = *bjtmp++ ;
1301*4c1414c8SBarry Smith         }
1302*4c1414c8SBarry Smith         pj  = bj + bi[row];
1303*4c1414c8SBarry Smith         pc1 = ba + bi[row];
1304*4c1414c8SBarry Smith 
1305*4c1414c8SBarry Smith         sctx.pv    = rtmp1[row];
1306*4c1414c8SBarry Smith         rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */
1307*4c1414c8SBarry Smith         rs         = 0.0;
1308*4c1414c8SBarry Smith         for (j=0; j<nz; j++) {
1309*4c1414c8SBarry Smith           idx    = pj[j];
1310*4c1414c8SBarry Smith           pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */
1311*4c1414c8SBarry Smith           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1312*4c1414c8SBarry Smith         }
1313*4c1414c8SBarry Smith         sctx.rs  = rs;
1314*4c1414c8SBarry Smith         ierr = MatLUCheckShift_inline(info,sctx,row,a->diag,newshift);CHKERRQ(ierr);
1315*4c1414c8SBarry Smith         if (newshift == 1) goto endofwhile;
1316*4c1414c8SBarry Smith         break;
1317*4c1414c8SBarry Smith 
1318*4c1414c8SBarry Smith       case 2:
1319*4c1414c8SBarry Smith         for (j=0; j<nz; j++) {
1320*4c1414c8SBarry Smith           idx        = bjtmp[j];
1321*4c1414c8SBarry Smith           rtmp1[idx] = 0.0;
1322*4c1414c8SBarry Smith           rtmp2[idx] = 0.0;
1323*4c1414c8SBarry Smith         }
1324*4c1414c8SBarry Smith 
1325*4c1414c8SBarry Smith         /* load in initial (unfactored row) */
1326*4c1414c8SBarry Smith         idx    = r[row];
1327*4c1414c8SBarry Smith         nz_tmp = ai[idx+1] - ai[idx];
1328*4c1414c8SBarry Smith         ajtmp  = aj + ai[idx];
1329*4c1414c8SBarry Smith         v1     = aa + ai[idx];
1330*4c1414c8SBarry Smith         v2     = aa + ai[idx+1];
1331*4c1414c8SBarry Smith         for (j=0; j<nz_tmp; j++) {
1332*4c1414c8SBarry Smith           idx        = ics[ajtmp[j]];
1333*4c1414c8SBarry Smith           rtmp1[idx] = v1[j];
1334*4c1414c8SBarry Smith           rtmp2[idx] = v2[j];
1335*4c1414c8SBarry Smith         }
1336*4c1414c8SBarry Smith         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1337*4c1414c8SBarry Smith         rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1338*4c1414c8SBarry Smith 
1339*4c1414c8SBarry Smith         prow = *bjtmp++ ;
1340*4c1414c8SBarry Smith         while (prow < row) {
1341*4c1414c8SBarry Smith           pc1 = rtmp1 + prow;
1342*4c1414c8SBarry Smith           pc2 = rtmp2 + prow;
1343*4c1414c8SBarry Smith           if (*pc1 != 0.0 || *pc2 != 0.0){
1344*4c1414c8SBarry Smith             pv   = ba + bd[prow];
1345*4c1414c8SBarry Smith             pj   = nbj + bd[prow];
1346*4c1414c8SBarry Smith             mul1 = *pc1 * *pv;
1347*4c1414c8SBarry Smith             mul2 = *pc2 * *pv;
1348*4c1414c8SBarry Smith             ++pv;
1349*4c1414c8SBarry Smith             *pc1 = mul1;
1350*4c1414c8SBarry Smith             *pc2 = mul2;
1351*4c1414c8SBarry Smith 
1352*4c1414c8SBarry Smith             nz_tmp = bi[prow+1] - bd[prow] - 1;
1353*4c1414c8SBarry Smith             for (j=0; j<nz_tmp; j++) {
1354*4c1414c8SBarry Smith               tmp = pv[j];
1355*4c1414c8SBarry Smith               idx = pj[j];
1356*4c1414c8SBarry Smith               rtmp1[idx] -= mul1 * tmp;
1357*4c1414c8SBarry Smith               rtmp2[idx] -= mul2 * tmp;
1358*4c1414c8SBarry Smith             }
1359*4c1414c8SBarry Smith             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1360*4c1414c8SBarry Smith           }
1361*4c1414c8SBarry Smith           prow = *bjtmp++ ;
1362*4c1414c8SBarry Smith         }
1363*4c1414c8SBarry Smith 
1364*4c1414c8SBarry Smith         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1365*4c1414c8SBarry Smith         pc1 = rtmp1 + prow;
1366*4c1414c8SBarry Smith         pc2 = rtmp2 + prow;
1367*4c1414c8SBarry Smith 
1368*4c1414c8SBarry Smith         sctx.pv = *pc1;
1369*4c1414c8SBarry Smith         pj      = bj + bi[prow];
1370*4c1414c8SBarry Smith         rs      = 0.0;
1371*4c1414c8SBarry Smith         for (j=0; j<nz; j++){
1372*4c1414c8SBarry Smith           idx = pj[j];
1373*4c1414c8SBarry Smith           if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]);
1374*4c1414c8SBarry Smith         }
1375*4c1414c8SBarry Smith         sctx.rs = rs;
1376*4c1414c8SBarry Smith         ierr = MatLUCheckShift_inline(info,sctx,row,a->diag,newshift);CHKERRQ(ierr);
1377*4c1414c8SBarry Smith         if (newshift == 1) goto endofwhile;
1378*4c1414c8SBarry Smith 
1379*4c1414c8SBarry Smith         if (*pc2 != 0.0){
1380*4c1414c8SBarry Smith           pj     = nbj + bd[prow];
1381*4c1414c8SBarry Smith           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1382*4c1414c8SBarry Smith           *pc2   = mul2;
1383*4c1414c8SBarry Smith           nz_tmp = bi[prow+1] - bd[prow] - 1;
1384*4c1414c8SBarry Smith           for (j=0; j<nz_tmp; j++) {
1385*4c1414c8SBarry Smith             idx = pj[j] ;
1386*4c1414c8SBarry Smith             tmp = rtmp1[idx];
1387*4c1414c8SBarry Smith             rtmp2[idx] -= mul2 * tmp;
1388*4c1414c8SBarry Smith           }
1389*4c1414c8SBarry Smith           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1390*4c1414c8SBarry Smith         }
1391*4c1414c8SBarry Smith 
1392*4c1414c8SBarry Smith         pj  = bj + bi[row];
1393*4c1414c8SBarry Smith         pc1 = ba + bi[row];
1394*4c1414c8SBarry Smith         pc2 = ba + bi[row+1];
1395*4c1414c8SBarry Smith 
1396*4c1414c8SBarry Smith         sctx.pv = rtmp2[row+1];
1397*4c1414c8SBarry Smith         rs = 0.0;
1398*4c1414c8SBarry Smith         rtmp1[row]   = 1.0/rtmp1[row];
1399*4c1414c8SBarry Smith         rtmp2[row+1] = 1.0/rtmp2[row+1];
1400*4c1414c8SBarry Smith         /* copy row entries from dense representation to sparse */
1401*4c1414c8SBarry Smith         for (j=0; j<nz; j++) {
1402*4c1414c8SBarry Smith           idx    = pj[j];
1403*4c1414c8SBarry Smith           pc1[j] = rtmp1[idx];
1404*4c1414c8SBarry Smith           pc2[j] = rtmp2[idx];
1405*4c1414c8SBarry Smith           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1406*4c1414c8SBarry Smith         }
1407*4c1414c8SBarry Smith         sctx.rs = rs;
1408*4c1414c8SBarry Smith         ierr = MatLUCheckShift_inline(info,sctx,row+1,a->diag,newshift);CHKERRQ(ierr);
1409*4c1414c8SBarry Smith         if (newshift == 1) goto endofwhile;
1410*4c1414c8SBarry Smith         break;
1411*4c1414c8SBarry Smith 
1412*4c1414c8SBarry Smith       case 3:
1413*4c1414c8SBarry Smith         for  (j=0; j<nz; j++) {
1414*4c1414c8SBarry Smith           idx        = bjtmp[j];
1415*4c1414c8SBarry Smith           rtmp1[idx] = 0.0;
1416*4c1414c8SBarry Smith           rtmp2[idx] = 0.0;
1417*4c1414c8SBarry Smith           rtmp3[idx] = 0.0;
1418*4c1414c8SBarry Smith         }
1419*4c1414c8SBarry Smith         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1420*4c1414c8SBarry Smith         idx    = r[row];
1421*4c1414c8SBarry Smith         nz_tmp = ai[idx+1] - ai[idx];
1422*4c1414c8SBarry Smith         ajtmp = aj + ai[idx];
1423*4c1414c8SBarry Smith         v1    = aa + ai[idx];
1424*4c1414c8SBarry Smith         v2    = aa + ai[idx+1];
1425*4c1414c8SBarry Smith         v3    = aa + ai[idx+2];
1426*4c1414c8SBarry Smith         for (j=0; j<nz_tmp; j++) {
1427*4c1414c8SBarry Smith           idx        = ics[ajtmp[j]];
1428*4c1414c8SBarry Smith           rtmp1[idx] = v1[j];
1429*4c1414c8SBarry Smith           rtmp2[idx] = v2[j];
1430*4c1414c8SBarry Smith           rtmp3[idx] = v3[j];
1431*4c1414c8SBarry Smith         }
1432*4c1414c8SBarry Smith         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1433*4c1414c8SBarry Smith         rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1434*4c1414c8SBarry Smith         rtmp3[ics[r[row+2]]] += sctx.shift_amount;
1435*4c1414c8SBarry Smith 
1436*4c1414c8SBarry Smith         /* loop over all pivot row blocks above this row block */
1437*4c1414c8SBarry Smith         prow = *bjtmp++ ;
1438*4c1414c8SBarry Smith         while (prow < row) {
1439*4c1414c8SBarry Smith           pc1 = rtmp1 + prow;
1440*4c1414c8SBarry Smith           pc2 = rtmp2 + prow;
1441*4c1414c8SBarry Smith           pc3 = rtmp3 + prow;
1442*4c1414c8SBarry Smith           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1443*4c1414c8SBarry Smith             pv   = ba  + bd[prow];
1444*4c1414c8SBarry Smith             pj   = nbj + bd[prow];
1445*4c1414c8SBarry Smith             mul1 = *pc1 * *pv;
1446*4c1414c8SBarry Smith             mul2 = *pc2 * *pv;
1447*4c1414c8SBarry Smith             mul3 = *pc3 * *pv;
1448*4c1414c8SBarry Smith             ++pv;
1449*4c1414c8SBarry Smith             *pc1 = mul1;
1450*4c1414c8SBarry Smith             *pc2 = mul2;
1451*4c1414c8SBarry Smith             *pc3 = mul3;
1452*4c1414c8SBarry Smith 
1453*4c1414c8SBarry Smith             nz_tmp = bi[prow+1] - bd[prow] - 1;
1454*4c1414c8SBarry Smith             /* update this row based on pivot row */
1455*4c1414c8SBarry Smith             for (j=0; j<nz_tmp; j++) {
1456*4c1414c8SBarry Smith               tmp = pv[j];
1457*4c1414c8SBarry Smith               idx = pj[j];
1458*4c1414c8SBarry Smith               rtmp1[idx] -= mul1 * tmp;
1459*4c1414c8SBarry Smith               rtmp2[idx] -= mul2 * tmp;
1460*4c1414c8SBarry Smith               rtmp3[idx] -= mul3 * tmp;
1461*4c1414c8SBarry Smith             }
1462*4c1414c8SBarry Smith             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1463*4c1414c8SBarry Smith           }
1464*4c1414c8SBarry Smith           prow = *bjtmp++ ;
1465*4c1414c8SBarry Smith         }
1466*4c1414c8SBarry Smith 
1467*4c1414c8SBarry Smith         /* Now take care of diagonal 3x3 block in this set of rows */
1468*4c1414c8SBarry Smith         /* note: prow = row here */
1469*4c1414c8SBarry Smith         pc1 = rtmp1 + prow;
1470*4c1414c8SBarry Smith         pc2 = rtmp2 + prow;
1471*4c1414c8SBarry Smith         pc3 = rtmp3 + prow;
1472*4c1414c8SBarry Smith 
1473*4c1414c8SBarry Smith         sctx.pv = *pc1;
1474*4c1414c8SBarry Smith         pj      = bj + bi[prow];
1475*4c1414c8SBarry Smith         rs      = 0.0;
1476*4c1414c8SBarry Smith         for (j=0; j<nz; j++){
1477*4c1414c8SBarry Smith           idx = pj[j];
1478*4c1414c8SBarry Smith           if (idx != row) rs += PetscAbsScalar(rtmp1[idx]);
1479*4c1414c8SBarry Smith         }
1480*4c1414c8SBarry Smith         sctx.rs = rs;
1481*4c1414c8SBarry Smith         ierr = MatLUCheckShift_inline(info,sctx,row,a->diag,newshift);CHKERRQ(ierr);
1482*4c1414c8SBarry Smith         if (newshift == 1) goto endofwhile;
1483*4c1414c8SBarry Smith 
1484*4c1414c8SBarry Smith         if (*pc2 != 0.0 || *pc3 != 0.0){
1485*4c1414c8SBarry Smith           mul2 = (*pc2)/(*pc1);
1486*4c1414c8SBarry Smith           mul3 = (*pc3)/(*pc1);
1487*4c1414c8SBarry Smith           *pc2 = mul2;
1488*4c1414c8SBarry Smith           *pc3 = mul3;
1489*4c1414c8SBarry Smith           nz_tmp = bi[prow+1] - bd[prow] - 1;
1490*4c1414c8SBarry Smith           pj     = nbj + bd[prow];
1491*4c1414c8SBarry Smith           for (j=0; j<nz_tmp; j++) {
1492*4c1414c8SBarry Smith             idx = pj[j] ;
1493*4c1414c8SBarry Smith             tmp = rtmp1[idx];
1494*4c1414c8SBarry Smith             rtmp2[idx] -= mul2 * tmp;
1495*4c1414c8SBarry Smith             rtmp3[idx] -= mul3 * tmp;
1496*4c1414c8SBarry Smith           }
1497*4c1414c8SBarry Smith           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1498*4c1414c8SBarry Smith         }
1499*4c1414c8SBarry Smith         ++prow;
1500*4c1414c8SBarry Smith 
1501*4c1414c8SBarry Smith         pc2 = rtmp2 + prow;
1502*4c1414c8SBarry Smith         pc3 = rtmp3 + prow;
1503*4c1414c8SBarry Smith         sctx.pv = *pc2;
1504*4c1414c8SBarry Smith         pj      = bj + bi[prow];
1505*4c1414c8SBarry Smith         rs      = 0.0;
1506*4c1414c8SBarry Smith         for (j=0; j<nz; j++){
1507*4c1414c8SBarry Smith           idx = pj[j];
1508*4c1414c8SBarry Smith           if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]);
1509*4c1414c8SBarry Smith         }
1510*4c1414c8SBarry Smith         sctx.rs = rs;
1511*4c1414c8SBarry Smith         ierr = MatLUCheckShift_inline(info,sctx,row+1,a->diag,newshift);CHKERRQ(ierr);
1512*4c1414c8SBarry Smith         if (newshift == 1) goto endofwhile;
1513*4c1414c8SBarry Smith 
1514*4c1414c8SBarry Smith         if (*pc3 != 0.0){
1515*4c1414c8SBarry Smith           mul3   = (*pc3)/(*pc2);
1516*4c1414c8SBarry Smith           *pc3   = mul3;
1517*4c1414c8SBarry Smith           pj     = nbj + bd[prow];
1518*4c1414c8SBarry Smith           nz_tmp = bi[prow+1] - bd[prow] - 1;
1519*4c1414c8SBarry Smith           for (j=0; j<nz_tmp; j++) {
1520*4c1414c8SBarry Smith             idx = pj[j] ;
1521*4c1414c8SBarry Smith             tmp = rtmp2[idx];
1522*4c1414c8SBarry Smith             rtmp3[idx] -= mul3 * tmp;
1523*4c1414c8SBarry Smith           }
1524*4c1414c8SBarry Smith           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1525*4c1414c8SBarry Smith         }
1526*4c1414c8SBarry Smith 
1527*4c1414c8SBarry Smith         pj  = bj + bi[row];
1528*4c1414c8SBarry Smith         pc1 = ba + bi[row];
1529*4c1414c8SBarry Smith         pc2 = ba + bi[row+1];
1530*4c1414c8SBarry Smith         pc3 = ba + bi[row+2];
1531*4c1414c8SBarry Smith 
1532*4c1414c8SBarry Smith         sctx.pv = rtmp3[row+2];
1533*4c1414c8SBarry Smith         rs = 0.0;
1534*4c1414c8SBarry Smith         rtmp1[row]   = 1.0/rtmp1[row];
1535*4c1414c8SBarry Smith         rtmp2[row+1] = 1.0/rtmp2[row+1];
1536*4c1414c8SBarry Smith         rtmp3[row+2] = 1.0/rtmp3[row+2];
1537*4c1414c8SBarry Smith         /* copy row entries from dense representation to sparse */
1538*4c1414c8SBarry Smith         for (j=0; j<nz; j++) {
1539*4c1414c8SBarry Smith           idx    = pj[j];
1540*4c1414c8SBarry Smith           pc1[j] = rtmp1[idx];
1541*4c1414c8SBarry Smith           pc2[j] = rtmp2[idx];
1542*4c1414c8SBarry Smith           pc3[j] = rtmp3[idx];
1543*4c1414c8SBarry Smith           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1544*4c1414c8SBarry Smith         }
1545*4c1414c8SBarry Smith 
1546*4c1414c8SBarry Smith         sctx.rs = rs;
1547*4c1414c8SBarry Smith         ierr = MatLUCheckShift_inline(info,sctx,row+2,a->diag,newshift);CHKERRQ(ierr);
1548*4c1414c8SBarry Smith         if (newshift == 1) goto endofwhile;
1549*4c1414c8SBarry Smith         break;
1550*4c1414c8SBarry Smith 
1551*4c1414c8SBarry Smith       default:
1552*4c1414c8SBarry Smith         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1553*4c1414c8SBarry Smith       }
1554*4c1414c8SBarry Smith       row += nodesz;                 /* Update the row */
1555*4c1414c8SBarry Smith     }
1556*4c1414c8SBarry Smith     endofwhile:;
1557*4c1414c8SBarry Smith   } while (sctx.lushift);
1558*4c1414c8SBarry Smith   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1559*4c1414c8SBarry Smith   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1560*4c1414c8SBarry Smith   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1561*4c1414c8SBarry Smith   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1562*4c1414c8SBarry Smith   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1563*4c1414c8SBarry Smith   C->factor      = FACTOR_LU;
1564*4c1414c8SBarry Smith   C->assembled   = PETSC_TRUE;
1565*4c1414c8SBarry Smith   if (sctx.nshift) {
1566*4c1414c8SBarry Smith     if (info->shiftnz) {
1567*4c1414c8SBarry Smith       ierr = PetscInfo2(0,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1568*4c1414c8SBarry Smith     } else if (info->shiftpd) {
1569*4c1414c8SBarry Smith       ierr = PetscInfo4(0,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1570*4c1414c8SBarry Smith     }
1571*4c1414c8SBarry Smith   }
1572*4c1414c8SBarry Smith   ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr);
1573*4c1414c8SBarry Smith   PetscFunctionReturn(0);
1574*4c1414c8SBarry Smith }
1575*4c1414c8SBarry Smith 
1576*4c1414c8SBarry Smith /*
1577*4c1414c8SBarry Smith      Makes a longer coloring[] array and calls the usual code with that
1578*4c1414c8SBarry Smith */
1579*4c1414c8SBarry Smith #undef __FUNCT__
1580*4c1414c8SBarry Smith #define __FUNCT__ "MatColoringPatch_Inode"
1581*4c1414c8SBarry Smith PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1582*4c1414c8SBarry Smith {
1583*4c1414c8SBarry Smith   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1584*4c1414c8SBarry Smith   PetscErrorCode  ierr;
1585*4c1414c8SBarry Smith   PetscInt        n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1586*4c1414c8SBarry Smith   PetscInt        *colorused,i;
1587*4c1414c8SBarry Smith   ISColoringValue *newcolor;
1588*4c1414c8SBarry Smith 
1589*4c1414c8SBarry Smith   PetscFunctionBegin;
1590*4c1414c8SBarry Smith   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1591*4c1414c8SBarry Smith   /* loop over inodes, marking a color for each column*/
1592*4c1414c8SBarry Smith   row = 0;
1593*4c1414c8SBarry Smith   for (i=0; i<m; i++){
1594*4c1414c8SBarry Smith     for (j=0; j<ns[i]; j++) {
1595*4c1414c8SBarry Smith       newcolor[row++] = coloring[i] + j*ncolors;
1596*4c1414c8SBarry Smith     }
1597*4c1414c8SBarry Smith   }
1598*4c1414c8SBarry Smith 
1599*4c1414c8SBarry Smith   /* eliminate unneeded colors */
1600*4c1414c8SBarry Smith   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1601*4c1414c8SBarry Smith   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1602*4c1414c8SBarry Smith   for (i=0; i<n; i++) {
1603*4c1414c8SBarry Smith     colorused[newcolor[i]] = 1;
1604*4c1414c8SBarry Smith   }
1605*4c1414c8SBarry Smith 
1606*4c1414c8SBarry Smith   for (i=1; i<5*ncolors; i++) {
1607*4c1414c8SBarry Smith     colorused[i] += colorused[i-1];
1608*4c1414c8SBarry Smith   }
1609*4c1414c8SBarry Smith   ncolors = colorused[5*ncolors-1];
1610*4c1414c8SBarry Smith   for (i=0; i<n; i++) {
1611*4c1414c8SBarry Smith     newcolor[i] = colorused[newcolor[i]];
1612*4c1414c8SBarry Smith   }
1613*4c1414c8SBarry Smith   ierr = PetscFree(colorused);CHKERRQ(ierr);
1614*4c1414c8SBarry Smith   ierr = ISColoringCreate(mat->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1615*4c1414c8SBarry Smith   ierr = PetscFree(coloring);CHKERRQ(ierr);
1616*4c1414c8SBarry Smith   PetscFunctionReturn(0);
1617*4c1414c8SBarry Smith }
1618*4c1414c8SBarry Smith 
1619*4c1414c8SBarry Smith /*
1620*4c1414c8SBarry Smith     samestructure indicates that the matrix has not changed its nonzero structure so we
1621*4c1414c8SBarry Smith     do not need to recompute the inodes
1622*4c1414c8SBarry Smith */
1623*4c1414c8SBarry Smith #undef __FUNCT__
1624*4c1414c8SBarry Smith #define __FUNCT__ "Mat_CheckInode"
1625*4c1414c8SBarry Smith PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
1626*4c1414c8SBarry Smith {
1627*4c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
1628*4c1414c8SBarry Smith   PetscErrorCode ierr;
1629*4c1414c8SBarry Smith   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
1630*4c1414c8SBarry Smith   PetscTruth     flag,flg;
1631*4c1414c8SBarry Smith 
1632*4c1414c8SBarry Smith   PetscFunctionBegin;
1633*4c1414c8SBarry Smith   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
1634*4c1414c8SBarry Smith 
1635*4c1414c8SBarry Smith   a->inode.checked = PETSC_TRUE;
1636*4c1414c8SBarry Smith 
1637*4c1414c8SBarry Smith   /* Notes: We set a->inode.limit=5 in MatCreate_Inode(). */
1638*4c1414c8SBarry Smith   if (!a->inode.use) {ierr = PetscInfo(A,"Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODES\n");CHKERRQ(ierr); PetscFunctionReturn(0);}
1639*4c1414c8SBarry Smith 
1640*4c1414c8SBarry Smith   ierr = PetscOptionsBegin(A->comm,A->prefix,"Options for SEQAIJ matrix","Mat");CHKERRQ(ierr);
1641*4c1414c8SBarry Smith     ierr = PetscOptionsTruth("-mat_no_unroll","Do not optimize for inodes (slower)",PETSC_NULL,PETSC_FALSE,&flg,PETSC_NULL);CHKERRQ(ierr);
1642*4c1414c8SBarry Smith     if (flg) {ierr = PetscInfo(A,"Not using Inode routines due to -mat_no_unroll\n");CHKERRQ(ierr);PetscFunctionReturn(0);}
1643*4c1414c8SBarry Smith     ierr = PetscOptionsTruth("-mat_no_inode","Do not optimize for inodes (slower)",PETSC_NULL,PETSC_FALSE,&flg,PETSC_NULL);CHKERRQ(ierr);
1644*4c1414c8SBarry Smith     if (flg) {ierr = PetscInfo(A,"Not using Inode routines due to -mat_no_inode\n");CHKERRQ(ierr);PetscFunctionReturn(0);}
1645*4c1414c8SBarry Smith     ierr = PetscOptionsInt("-mat_inode_limit","Do not use inodes larger then this value",PETSC_NULL,a->inode.limit,&a->inode.limit,PETSC_NULL);CHKERRQ(ierr);
1646*4c1414c8SBarry Smith   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1647*4c1414c8SBarry Smith   if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit;
1648*4c1414c8SBarry Smith 
1649*4c1414c8SBarry Smith   m = A->rmap.n;
1650*4c1414c8SBarry Smith   if (a->inode.size) {ns = a->inode.size;}
1651*4c1414c8SBarry Smith   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
1652*4c1414c8SBarry Smith 
1653*4c1414c8SBarry Smith   i          = 0;
1654*4c1414c8SBarry Smith   node_count = 0;
1655*4c1414c8SBarry Smith   idx        = a->j;
1656*4c1414c8SBarry Smith   ii         = a->i;
1657*4c1414c8SBarry Smith   while (i < m){                /* For each row */
1658*4c1414c8SBarry Smith     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
1659*4c1414c8SBarry Smith     /* Limits the number of elements in a node to 'a->inode.limit' */
1660*4c1414c8SBarry Smith     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
1661*4c1414c8SBarry Smith       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
1662*4c1414c8SBarry Smith       if (nzy != nzx) break;
1663*4c1414c8SBarry Smith       idy  += nzx;             /* Same nonzero pattern */
1664*4c1414c8SBarry Smith       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1665*4c1414c8SBarry Smith       if (!flag) break;
1666*4c1414c8SBarry Smith     }
1667*4c1414c8SBarry Smith     ns[node_count++] = blk_size;
1668*4c1414c8SBarry Smith     idx += blk_size*nzx;
1669*4c1414c8SBarry Smith     i    = j;
1670*4c1414c8SBarry Smith   }
1671*4c1414c8SBarry Smith   /* If not enough inodes found,, do not use inode version of the routines */
1672*4c1414c8SBarry Smith   if (!a->inode.size && m && node_count > 0.9*m) {
1673*4c1414c8SBarry Smith     ierr = PetscFree(ns);CHKERRQ(ierr);
1674*4c1414c8SBarry Smith     a->inode.node_count     = 0;
1675*4c1414c8SBarry Smith     a->inode.size           = PETSC_NULL;
1676*4c1414c8SBarry Smith     a->inode.use            = PETSC_FALSE;
1677*4c1414c8SBarry Smith     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
1678*4c1414c8SBarry Smith   } else {
1679*4c1414c8SBarry Smith     A->ops->mult            = MatMult_Inode;
1680*4c1414c8SBarry Smith     A->ops->multadd         = MatMultAdd_Inode;
1681*4c1414c8SBarry Smith     A->ops->solve           = MatSolve_Inode;
1682*4c1414c8SBarry Smith     A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
1683*4c1414c8SBarry Smith     A->ops->getrowij        = MatGetRowIJ_Inode;
1684*4c1414c8SBarry Smith     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
1685*4c1414c8SBarry Smith     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
1686*4c1414c8SBarry Smith     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
1687*4c1414c8SBarry Smith     A->ops->coloringpatch   = MatColoringPatch_Inode;
1688*4c1414c8SBarry Smith     a->inode.node_count     = node_count;
1689*4c1414c8SBarry Smith     a->inode.size           = ns;
1690*4c1414c8SBarry Smith     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
1691*4c1414c8SBarry Smith   }
1692*4c1414c8SBarry Smith   PetscFunctionReturn(0);
1693*4c1414c8SBarry Smith }
1694*4c1414c8SBarry Smith 
1695*4c1414c8SBarry Smith /*
1696*4c1414c8SBarry Smith      This is really ugly. if inodes are used this replaces the
1697*4c1414c8SBarry Smith   permutations with ones that correspond to rows/cols of the matrix
1698*4c1414c8SBarry Smith   rather then inode blocks
1699*4c1414c8SBarry Smith */
1700*4c1414c8SBarry Smith #undef __FUNCT__
1701*4c1414c8SBarry Smith #define __FUNCT__ "MatInodeAdjustForInodes"
1702*4c1414c8SBarry Smith PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1703*4c1414c8SBarry Smith {
1704*4c1414c8SBarry Smith   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
1705*4c1414c8SBarry Smith 
1706*4c1414c8SBarry Smith   PetscFunctionBegin;
1707*4c1414c8SBarry Smith   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
1708*4c1414c8SBarry Smith   if (f) {
1709*4c1414c8SBarry Smith     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
1710*4c1414c8SBarry Smith   }
1711*4c1414c8SBarry Smith   PetscFunctionReturn(0);
1712*4c1414c8SBarry Smith }
1713*4c1414c8SBarry Smith 
1714*4c1414c8SBarry Smith EXTERN_C_BEGIN
1715*4c1414c8SBarry Smith #undef __FUNCT__
1716*4c1414c8SBarry Smith #define __FUNCT__ "MatAdjustForInodes_Inode"
1717*4c1414c8SBarry Smith PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
1718*4c1414c8SBarry Smith {
1719*4c1414c8SBarry Smith   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
1720*4c1414c8SBarry Smith   PetscErrorCode ierr;
1721*4c1414c8SBarry Smith   PetscInt       m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1722*4c1414c8SBarry Smith   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
1723*4c1414c8SBarry Smith   PetscInt       nslim_col,*ns_col;
1724*4c1414c8SBarry Smith   IS             ris = *rperm,cis = *cperm;
1725*4c1414c8SBarry Smith 
1726*4c1414c8SBarry Smith   PetscFunctionBegin;
1727*4c1414c8SBarry Smith   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
1728*4c1414c8SBarry Smith   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
1729*4c1414c8SBarry Smith 
1730*4c1414c8SBarry Smith   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
1731*4c1414c8SBarry Smith   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
1732*4c1414c8SBarry Smith   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
1733*4c1414c8SBarry Smith   permc = permr + m;
1734*4c1414c8SBarry Smith 
1735*4c1414c8SBarry Smith   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
1736*4c1414c8SBarry Smith   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
1737*4c1414c8SBarry Smith 
1738*4c1414c8SBarry Smith   /* Form the inode structure for the rows of permuted matric using inv perm*/
1739*4c1414c8SBarry Smith   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
1740*4c1414c8SBarry Smith 
1741*4c1414c8SBarry Smith   /* Construct the permutations for rows*/
1742*4c1414c8SBarry Smith   for (i=0,row = 0; i<nslim_row; ++i){
1743*4c1414c8SBarry Smith     indx      = ridx[i];
1744*4c1414c8SBarry Smith     start_val = tns[indx];
1745*4c1414c8SBarry Smith     end_val   = tns[indx + 1];
1746*4c1414c8SBarry Smith     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1747*4c1414c8SBarry Smith   }
1748*4c1414c8SBarry Smith 
1749*4c1414c8SBarry Smith   /* Form the inode structure for the columns of permuted matrix using inv perm*/
1750*4c1414c8SBarry Smith   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
1751*4c1414c8SBarry Smith 
1752*4c1414c8SBarry Smith  /* Construct permutations for columns */
1753*4c1414c8SBarry Smith   for (i=0,col=0; i<nslim_col; ++i){
1754*4c1414c8SBarry Smith     indx      = cidx[i];
1755*4c1414c8SBarry Smith     start_val = tns[indx];
1756*4c1414c8SBarry Smith     end_val   = tns[indx + 1];
1757*4c1414c8SBarry Smith     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1758*4c1414c8SBarry Smith   }
1759*4c1414c8SBarry Smith 
1760*4c1414c8SBarry Smith   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
1761*4c1414c8SBarry Smith   ISSetPermutation(*rperm);
1762*4c1414c8SBarry Smith   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
1763*4c1414c8SBarry Smith   ISSetPermutation(*cperm);
1764*4c1414c8SBarry Smith 
1765*4c1414c8SBarry Smith   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
1766*4c1414c8SBarry Smith   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
1767*4c1414c8SBarry Smith 
1768*4c1414c8SBarry Smith   ierr = PetscFree(ns_col);CHKERRQ(ierr);
1769*4c1414c8SBarry Smith   ierr = PetscFree(permr);CHKERRQ(ierr);
1770*4c1414c8SBarry Smith   ierr = ISDestroy(cis);CHKERRQ(ierr);
1771*4c1414c8SBarry Smith   ierr = ISDestroy(ris);CHKERRQ(ierr);
1772*4c1414c8SBarry Smith   ierr = PetscFree(tns);CHKERRQ(ierr);
1773*4c1414c8SBarry Smith   PetscFunctionReturn(0);
1774*4c1414c8SBarry Smith }
1775*4c1414c8SBarry Smith EXTERN_C_END
1776*4c1414c8SBarry Smith 
1777*4c1414c8SBarry Smith #undef __FUNCT__
1778*4c1414c8SBarry Smith #define __FUNCT__ "MatInodeGetInodeSizes"
1779*4c1414c8SBarry Smith /*@C
1780*4c1414c8SBarry Smith    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
1781*4c1414c8SBarry Smith 
1782*4c1414c8SBarry Smith    Collective on Mat
1783*4c1414c8SBarry Smith 
1784*4c1414c8SBarry Smith    Input Parameter:
1785*4c1414c8SBarry Smith .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
1786*4c1414c8SBarry Smith 
1787*4c1414c8SBarry Smith    Output Parameter:
1788*4c1414c8SBarry Smith +  node_count - no of inodes present in the matrix.
1789*4c1414c8SBarry Smith .  sizes      - an array of size node_count,with sizes of each inode.
1790*4c1414c8SBarry Smith -  limit      - the max size used to generate the inodes.
1791*4c1414c8SBarry Smith 
1792*4c1414c8SBarry Smith    Level: advanced
1793*4c1414c8SBarry Smith 
1794*4c1414c8SBarry Smith    Notes: This routine returns some internal storage information
1795*4c1414c8SBarry Smith    of the matrix, it is intended to be used by advanced users.
1796*4c1414c8SBarry Smith    It should be called after the matrix is assembled.
1797*4c1414c8SBarry Smith    The contents of the sizes[] array should not be changed.
1798*4c1414c8SBarry Smith    PETSC_NULL may be passed for information not requested.
1799*4c1414c8SBarry Smith 
1800*4c1414c8SBarry Smith .keywords: matrix, seqaij, get, inode
1801*4c1414c8SBarry Smith 
1802*4c1414c8SBarry Smith .seealso: MatGetInfo()
1803*4c1414c8SBarry Smith @*/
1804*4c1414c8SBarry Smith PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1805*4c1414c8SBarry Smith {
1806*4c1414c8SBarry Smith   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
1807*4c1414c8SBarry Smith 
1808*4c1414c8SBarry Smith   PetscFunctionBegin;
1809*4c1414c8SBarry Smith   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1810*4c1414c8SBarry Smith   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
1811*4c1414c8SBarry Smith   if (f) {
1812*4c1414c8SBarry Smith     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
1813*4c1414c8SBarry Smith   }
1814*4c1414c8SBarry Smith   PetscFunctionReturn(0);
1815*4c1414c8SBarry Smith }
1816*4c1414c8SBarry Smith 
1817*4c1414c8SBarry Smith EXTERN_C_BEGIN
1818*4c1414c8SBarry Smith #undef __FUNCT__
1819*4c1414c8SBarry Smith #define __FUNCT__ "MatInodeGetInodeSizes_Inode"
1820*4c1414c8SBarry Smith PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1821*4c1414c8SBarry Smith {
1822*4c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1823*4c1414c8SBarry Smith 
1824*4c1414c8SBarry Smith   PetscFunctionBegin;
1825*4c1414c8SBarry Smith   if (node_count) *node_count = a->inode.node_count;
1826*4c1414c8SBarry Smith   if (sizes)      *sizes      = a->inode.size;
1827*4c1414c8SBarry Smith   if (limit)      *limit      = a->inode.limit;
1828*4c1414c8SBarry Smith   PetscFunctionReturn(0);
1829*4c1414c8SBarry Smith }
1830*4c1414c8SBarry Smith EXTERN_C_END
1831