xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 81f96dcc196638099f4f8062b7ddb72e8b817c95)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 
7 #undef __FUNCT__
8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
12   PetscErrorCode ierr;
13   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
14   const PetscInt *idx;
15   PetscInt       start,end,*ai,*aj,bs,*nidx2;
16   PetscBT        table;
17 
18   PetscFunctionBegin;
19   m     = a->mbs;
20   ai    = a->i;
21   aj    = a->j;
22   bs    = A->rmap->bs;
23 
24   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25 
26   ierr = PetscBTCreate(m,table);CHKERRQ(ierr);
27   ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
28   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) {
31     /* Initialise the two local arrays */
32     isz  = 0;
33     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
37     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j=0; j<n ; ++j){
41       ival = idx[j]/bs; /* convert the indices into block indices */
42       if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
44     }
45     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
46     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
47 
48     k = 0;
49     for (j=0; j<ov; j++){ /* for each overlap*/
50       n = isz;
51       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row+1];
55         for (l = start; l<end ; l++){
56           val = aj[l];
57           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
58         }
59       }
60     }
61     /* expand the Index Set */
62     for (j=0; j<isz; j++) {
63       for (k=0; k<bs; k++)
64         nidx2[j*bs+k] = nidx[j]*bs+k;
65     }
66     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
67   }
68   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
69   ierr = PetscFree(nidx);CHKERRQ(ierr);
70   ierr = PetscFree(nidx2);CHKERRQ(ierr);
71   PetscFunctionReturn(0);
72 }
73 
74 #undef __FUNCT__
75 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
76 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77 {
78   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
79   PetscErrorCode ierr;
80   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
82   const PetscInt *irow,*icol;
83   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84   PetscInt       *aj = a->j,*ai = a->i;
85   MatScalar      *mat_a;
86   Mat            C;
87   PetscTruth     flag,sorted;
88 
89   PetscFunctionBegin;
90   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
91   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
92 
93   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
94   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
95   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
96   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
97 
98   ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
99   ssmap = smap;
100   ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
101   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
102   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
103   /* determine lens of each row */
104   for (i=0; i<nrows; i++) {
105     kstart  = ai[irow[i]];
106     kend    = kstart + a->ilen[irow[i]];
107     lens[i] = 0;
108       for (k=kstart; k<kend; k++) {
109         if (ssmap[aj[k]]) {
110           lens[i]++;
111         }
112       }
113     }
114   /* Create and fill new matrix */
115   if (scall == MAT_REUSE_MATRIX) {
116     c = (Mat_SeqBAIJ *)((*B)->data);
117 
118     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
119     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
120     if (!flag) {
121       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
122     }
123     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
124     C = *B;
125   } else {
126     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
127     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
128     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
129     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
130   }
131   c = (Mat_SeqBAIJ *)(C->data);
132   for (i=0; i<nrows; i++) {
133     row    = irow[i];
134     kstart = ai[row];
135     kend   = kstart + a->ilen[row];
136     mat_i  = c->i[i];
137     mat_j  = c->j + mat_i;
138     mat_a  = c->a + mat_i*bs2;
139     mat_ilen = c->ilen + i;
140     for (k=kstart; k<kend; k++) {
141       if ((tcol=ssmap[a->j[k]])) {
142         *mat_j++ = tcol - 1;
143         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
144         mat_a   += bs2;
145         (*mat_ilen)++;
146       }
147     }
148   }
149 
150   /* Free work space */
151   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
152   ierr = PetscFree(smap);CHKERRQ(ierr);
153   ierr = PetscFree(lens);CHKERRQ(ierr);
154   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
155   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
156 
157   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
158   *B = C;
159   PetscFunctionReturn(0);
160 }
161 
162 #undef __FUNCT__
163 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
164 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
165 {
166   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
167   IS             is1,is2;
168   PetscErrorCode ierr;
169   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
170   const PetscInt *irow,*icol;
171 
172   PetscFunctionBegin;
173   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
174   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
175   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
176   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
177 
178   /* Verify if the indices corespond to each element in a block
179    and form the IS with compressed IS */
180   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
181   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
182   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
183   count = 0;
184   for (i=0; i<a->mbs; i++) {
185     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
186     if (vary[i]==bs) iary[count++] = i;
187   }
188   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
189 
190   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
191   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
192   count = 0;
193   for (i=0; i<a->mbs; i++) {
194     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
195     if (vary[i]==bs) iary[count++] = i;
196   }
197   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);CHKERRQ(ierr);
198   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
199   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
200   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
201 
202   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
203   ISDestroy(is1);
204   ISDestroy(is2);
205   PetscFunctionReturn(0);
206 }
207 
208 #undef __FUNCT__
209 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
210 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
211 {
212   PetscErrorCode ierr;
213   PetscInt       i;
214 
215   PetscFunctionBegin;
216   if (scall == MAT_INITIAL_MATRIX) {
217     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
218   }
219 
220   for (i=0; i<n; i++) {
221     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
222   }
223   PetscFunctionReturn(0);
224 }
225 
226 
227 /* -------------------------------------------------------*/
228 /* Should check that shapes of vectors and matrices match */
229 /* -------------------------------------------------------*/
230 
231 #undef __FUNCT__
232 #define __FUNCT__ "MatMult_SeqBAIJ_1"
233 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
234 {
235   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
236   PetscScalar       *z,sum;
237   const PetscScalar *x;
238   const MatScalar   *v;
239   PetscErrorCode    ierr;
240   PetscInt          mbs,i,n,nonzerorow=0;
241   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
242   PetscTruth        usecprow=a->compressedrow.use;
243 
244   PetscFunctionBegin;
245   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
246   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
247 
248   if (usecprow){
249     mbs  = a->compressedrow.nrows;
250     ii   = a->compressedrow.i;
251     ridx = a->compressedrow.rindex;
252     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
253   } else {
254     mbs = a->mbs;
255     ii  = a->i;
256   }
257 
258   for (i=0; i<mbs; i++) {
259     n    = ii[1] - ii[0];
260     v    = a->a + ii[0];
261     idx  = a->j + ii[0];
262     ii++;
263     sum  = 0.0;
264     PetscSparseDensePlusDot(sum,x,v,idx,n);
265     if (usecprow){
266       z[ridx[i]] = sum;
267     } else {
268       nonzerorow += (n>0);
269       z[i] = sum;
270     }
271   }
272   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
273   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
274   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
275   PetscFunctionReturn(0);
276 }
277 
278 #undef __FUNCT__
279 #define __FUNCT__ "MatMult_SeqBAIJ_2"
280 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
281 {
282   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
283   PetscScalar       *z = 0,sum1,sum2,*zarray;
284   const PetscScalar *x,*xb;
285   PetscScalar       x1,x2;
286   const MatScalar   *v;
287   PetscErrorCode    ierr;
288   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
289   PetscTruth        usecprow=a->compressedrow.use;
290 
291   PetscFunctionBegin;
292   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
293   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
294 
295   idx = a->j;
296   v   = a->a;
297   if (usecprow){
298     mbs  = a->compressedrow.nrows;
299     ii   = a->compressedrow.i;
300     ridx = a->compressedrow.rindex;
301   } else {
302     mbs = a->mbs;
303     ii  = a->i;
304     z   = zarray;
305   }
306 
307   for (i=0; i<mbs; i++) {
308     n  = ii[1] - ii[0]; ii++;
309     sum1 = 0.0; sum2 = 0.0;
310     nonzerorow += (n>0);
311     for (j=0; j<n; j++) {
312       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
313       sum1 += v[0]*x1 + v[2]*x2;
314       sum2 += v[1]*x1 + v[3]*x2;
315       v += 4;
316     }
317     if (usecprow) z = zarray + 2*ridx[i];
318     z[0] = sum1; z[1] = sum2;
319     if (!usecprow) z += 2;
320   }
321   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
322   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
323   ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr);
324   PetscFunctionReturn(0);
325 }
326 
327 #undef __FUNCT__
328 #define __FUNCT__ "MatMult_SeqBAIJ_3"
329 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
330 {
331   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
332   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
333   const PetscScalar *x,*xb;
334   const MatScalar   *v;
335   PetscErrorCode    ierr;
336   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
337   PetscTruth        usecprow=a->compressedrow.use;
338 
339 
340 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
341 #pragma disjoint(*v,*z,*xb)
342 #endif
343 
344   PetscFunctionBegin;
345   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
346   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
347 
348   idx = a->j;
349   v   = a->a;
350   if (usecprow){
351     mbs  = a->compressedrow.nrows;
352     ii   = a->compressedrow.i;
353     ridx = a->compressedrow.rindex;
354   } else {
355     mbs = a->mbs;
356     ii  = a->i;
357     z   = zarray;
358   }
359 
360   for (i=0; i<mbs; i++) {
361     n  = ii[1] - ii[0]; ii++;
362     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
363     nonzerorow += (n>0);
364     for (j=0; j<n; j++) {
365       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
366       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
367       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
368       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
369       v += 9;
370     }
371     if (usecprow) z = zarray + 3*ridx[i];
372     z[0] = sum1; z[1] = sum2; z[2] = sum3;
373     if (!usecprow) z += 3;
374   }
375   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
376   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
377   ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr);
378   PetscFunctionReturn(0);
379 }
380 
381 #undef __FUNCT__
382 #define __FUNCT__ "MatMult_SeqBAIJ_4"
383 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
384 {
385   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
386   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
387   const PetscScalar *x,*xb;
388   const MatScalar   *v;
389   PetscErrorCode    ierr;
390   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
391   PetscTruth        usecprow=a->compressedrow.use;
392 
393   PetscFunctionBegin;
394   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
395   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
396 
397   idx = a->j;
398   v   = a->a;
399   if (usecprow){
400     mbs  = a->compressedrow.nrows;
401     ii   = a->compressedrow.i;
402     ridx = a->compressedrow.rindex;
403   } else {
404     mbs = a->mbs;
405     ii  = a->i;
406     z   = zarray;
407   }
408 
409   for (i=0; i<mbs; i++) {
410     n  = ii[1] - ii[0]; ii++;
411     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
412     nonzerorow += (n>0);
413     for (j=0; j<n; j++) {
414       xb = x + 4*(*idx++);
415       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
416       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
417       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
418       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
419       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
420       v += 16;
421     }
422     if (usecprow) z = zarray + 4*ridx[i];
423     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
424     if (!usecprow) z += 4;
425   }
426   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
427   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
428   ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr);
429   PetscFunctionReturn(0);
430 }
431 
432 #undef __FUNCT__
433 #define __FUNCT__ "MatMult_SeqBAIJ_5"
434 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
435 {
436   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
437   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
438   const PetscScalar *xb,*x;
439   const MatScalar   *v;
440   PetscErrorCode    ierr;
441   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
442   PetscInt          mbs,i,j,n,nonzerorow=0;
443   PetscTruth        usecprow=a->compressedrow.use;
444 
445   PetscFunctionBegin;
446   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
447   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
448 
449   idx = a->j;
450   v   = a->a;
451   if (usecprow){
452     mbs  = a->compressedrow.nrows;
453     ii   = a->compressedrow.i;
454     ridx = a->compressedrow.rindex;
455   } else {
456     mbs = a->mbs;
457     ii  = a->i;
458     z   = zarray;
459   }
460 
461   for (i=0; i<mbs; i++) {
462     n  = ii[1] - ii[0]; ii++;
463     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
464     nonzerorow += (n>0);
465     for (j=0; j<n; j++) {
466       xb = x + 5*(*idx++);
467       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
468       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
469       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
470       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
471       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
472       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
473       v += 25;
474     }
475     if (usecprow) z = zarray + 5*ridx[i];
476     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
477     if (!usecprow) z += 5;
478   }
479   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
480   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
481   ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr);
482   PetscFunctionReturn(0);
483 }
484 
485 
486 #undef __FUNCT__
487 #define __FUNCT__ "MatMult_SeqBAIJ_6"
488 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
489 {
490   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
491   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
492   const PetscScalar *x,*xb;
493   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
494   const MatScalar   *v;
495   PetscErrorCode    ierr;
496   PetscInt          mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
497   PetscTruth        usecprow=a->compressedrow.use;
498 
499   PetscFunctionBegin;
500   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
501   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
502 
503   idx = a->j;
504   v   = a->a;
505   if (usecprow){
506     mbs  = a->compressedrow.nrows;
507     ii   = a->compressedrow.i;
508     ridx = a->compressedrow.rindex;
509   } else {
510     mbs = a->mbs;
511     ii  = a->i;
512     z   = zarray;
513   }
514 
515   for (i=0; i<mbs; i++) {
516     n  = ii[1] - ii[0]; ii++;
517     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
518     nonzerorow += (n>0);
519     for (j=0; j<n; j++) {
520       xb = x + 6*(*idx++);
521       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
522       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
523       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
524       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
525       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
526       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
527       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
528       v += 36;
529     }
530     if (usecprow) z = zarray + 6*ridx[i];
531     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
532     if (!usecprow) z += 6;
533   }
534 
535   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
536   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
537   ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr);
538   PetscFunctionReturn(0);
539 }
540 
541 #undef __FUNCT__
542 #define __FUNCT__ "MatMult_SeqBAIJ_7"
543 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
544 {
545   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
546   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
547   const PetscScalar *x,*xb;
548   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
549   const MatScalar   *v;
550   PetscErrorCode    ierr;
551   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
552   PetscTruth        usecprow=a->compressedrow.use;
553 
554   PetscFunctionBegin;
555   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
556   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
557 
558   idx = a->j;
559   v   = a->a;
560   if (usecprow){
561     mbs    = a->compressedrow.nrows;
562     ii     = a->compressedrow.i;
563     ridx = a->compressedrow.rindex;
564   } else {
565     mbs = a->mbs;
566     ii  = a->i;
567     z   = zarray;
568   }
569 
570   for (i=0; i<mbs; i++) {
571     n  = ii[1] - ii[0]; ii++;
572     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
573     nonzerorow += (n>0);
574     for (j=0; j<n; j++) {
575       xb = x + 7*(*idx++);
576       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
577       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
578       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
579       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
580       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
581       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
582       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
583       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
584       v += 49;
585     }
586     if (usecprow) z = zarray + 7*ridx[i];
587     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
588     if (!usecprow) z += 7;
589   }
590 
591   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
592   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
593   ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr);
594   PetscFunctionReturn(0);
595 }
596 
597 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
598 /* Default MatMult for block size 15 */
599 
600 #undef __FUNCT__
601 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
602 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
603 {
604   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
605   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
606   const PetscScalar *x,*xb;
607   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
608   const MatScalar   *v;
609   PetscErrorCode    ierr;
610   const PetscInt    *ii,*ij=a->j,*idx;
611   PetscInt          mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0;
612   PetscTruth        usecprow=a->compressedrow.use;
613 
614   PetscFunctionBegin;
615   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
616   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
617 
618   v   = a->a;
619   if (usecprow){
620     mbs    = a->compressedrow.nrows;
621     ii     = a->compressedrow.i;
622     ridx = a->compressedrow.rindex;
623   } else {
624     mbs = a->mbs;
625     ii  = a->i;
626     z   = zarray;
627   }
628 
629   for (i=0; i<mbs; i++) {
630     n  = ii[i+1] - ii[i];
631     idx = ij + ii[i];
632     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
633     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
634 
635     nonzerorow += (n>0);
636     for (j=0; j<n; j++) {
637       xb = x + 15*(idx[j]);
638       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
639       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
640 
641       for(k=0;k<15;k++){
642 	sum1  += v[0]*xb[k];
643         sum2  += v[1]*xb[k];
644 	sum3  += v[2]*xb[k];
645 	sum4  += v[3]*xb[k];
646 	sum5  += v[4]*xb[k];
647         sum6  += v[5]*xb[k];
648 	sum7  += v[6]*xb[k];
649 	sum8  += v[7]*xb[k];
650         sum9  += v[8]*xb[k];
651         sum10 += v[9]*xb[k];
652 	sum11 += v[10]*xb[k];
653 	sum12 += v[11]*xb[k];
654 	sum13 += v[12]*xb[k];
655         sum14 += v[13]*xb[k];
656 	sum15 += v[14]*xb[k];
657 	v += 15;
658       }
659     }
660     if (usecprow) z = zarray + 15*ridx[i];
661     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
662     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
663 
664     if (!usecprow) z += 15;
665   }
666 
667   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
668   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
669   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
670   PetscFunctionReturn(0);
671 }
672 
673 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
674 #undef __FUNCT__
675 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
676 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
677 {
678   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
679   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
680   const PetscScalar *x,*xb;
681   PetscScalar        x1,x2,x3,x4,*zarray;
682   const MatScalar   *v;
683   PetscErrorCode    ierr;
684   const PetscInt    *ii,*ij=a->j,*idx;
685   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
686   PetscTruth        usecprow=a->compressedrow.use;
687 
688   PetscFunctionBegin;
689   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
690   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
691 
692   v   = a->a;
693   if (usecprow){
694     mbs    = a->compressedrow.nrows;
695     ii     = a->compressedrow.i;
696     ridx = a->compressedrow.rindex;
697   } else {
698     mbs = a->mbs;
699     ii  = a->i;
700     z   = zarray;
701   }
702 
703   for (i=0; i<mbs; i++) {
704     n  = ii[i+1] - ii[i];
705     idx = ij + ii[i];
706     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
707     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
708 
709     nonzerorow += (n>0);
710     for (j=0; j<n; j++) {
711       xb = x + 15*(idx[j]);
712       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
713 
714       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
715       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
716       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
717       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
718       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
719       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
720       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
721       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
722       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
723       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
724       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
725       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
726       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
727       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
728       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
729 
730       v += 60;
731 
732       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
733 
734       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
735       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
736       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
737       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
738       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
739       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
740       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
741       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
742       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
743       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
744       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
745       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
746       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
747       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
748       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
749       v += 60;
750 
751       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
752       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
753       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
754       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
755       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
756       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
757       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
758       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
759       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
760       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
761       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
762       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
763       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
764       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
765       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
766       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
767       v  += 60;
768 
769       x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
770       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
771       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
772       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
773       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
774       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
775       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
776       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
777       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
778       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
779       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
780       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
781       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
782       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
783       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
784       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
785       v += 45;
786     }
787     if (usecprow) z = zarray + 15*ridx[i];
788     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
789     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
790 
791     if (!usecprow) z += 15;
792   }
793 
794   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
795   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
796   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
797   PetscFunctionReturn(0);
798 }
799 
800 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
801 #undef __FUNCT__
802 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
803 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
804 {
805   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
806   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
807   const PetscScalar *x,*xb;
808   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
809   const MatScalar   *v;
810   PetscErrorCode    ierr;
811   const PetscInt    *ii,*ij=a->j,*idx;
812   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
813   PetscTruth        usecprow=a->compressedrow.use;
814 
815   PetscFunctionBegin;
816   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
817   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
818 
819   v   = a->a;
820   if (usecprow){
821     mbs    = a->compressedrow.nrows;
822     ii     = a->compressedrow.i;
823     ridx = a->compressedrow.rindex;
824   } else {
825     mbs = a->mbs;
826     ii  = a->i;
827     z   = zarray;
828   }
829 
830   for (i=0; i<mbs; i++) {
831     n  = ii[i+1] - ii[i];
832     idx = ij + ii[i];
833     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
834     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
835 
836     nonzerorow += (n>0);
837     for (j=0; j<n; j++) {
838       xb = x + 15*(idx[j]);
839       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
840       x8 = xb[7];
841 
842       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
843       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
844       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
845       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
846       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
847       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
848       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
849       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
850       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
851       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
852       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
853       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
854       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
855       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
856       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
857       v += 120;
858 
859       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
860 
861       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
862       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
863       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
864       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
865       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
866       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
867       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
868       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
869       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
870       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
871       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
872       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
873       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
874       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
875       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
876       v += 105;
877     }
878     if (usecprow) z = zarray + 15*ridx[i];
879     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
880     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
881 
882     if (!usecprow) z += 15;
883   }
884 
885   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
886   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
887   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
888   PetscFunctionReturn(0);
889 }
890 
891 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
892 
893 #undef __FUNCT__
894 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
895 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
896 {
897   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
898   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
899   const PetscScalar *x,*xb;
900   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
901   const MatScalar   *v;
902   PetscErrorCode    ierr;
903   const PetscInt    *ii,*ij=a->j,*idx;
904   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
905   PetscTruth        usecprow=a->compressedrow.use;
906 
907   PetscFunctionBegin;
908   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
909   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
910 
911   v   = a->a;
912   if (usecprow){
913     mbs    = a->compressedrow.nrows;
914     ii     = a->compressedrow.i;
915     ridx = a->compressedrow.rindex;
916   } else {
917     mbs = a->mbs;
918     ii  = a->i;
919     z   = zarray;
920   }
921 
922   for (i=0; i<mbs; i++) {
923     n  = ii[i+1] - ii[i];
924     idx = ij + ii[i];
925     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
926     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
927 
928     nonzerorow += (n>0);
929     for (j=0; j<n; j++) {
930       xb = x + 15*(idx[j]);
931       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
932       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
933 
934       sum1 +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
935       sum2 +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
936       sum3 +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
937       sum4 +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
938       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
939       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
940       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
941       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
942       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
943       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
944       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
945       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
946       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
947       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
948       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
949       v += 225;
950     }
951     if (usecprow) z = zarray + 15*ridx[i];
952     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
953     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
954 
955     if (!usecprow) z += 15;
956   }
957 
958   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
959   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
960   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
961   PetscFunctionReturn(0);
962 }
963 
964 
965 /*
966     This will not work with MatScalar == float because it calls the BLAS
967 */
968 #undef __FUNCT__
969 #define __FUNCT__ "MatMult_SeqBAIJ_N"
970 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
971 {
972   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
973   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
974   MatScalar      *v;
975   PetscErrorCode ierr;
976   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
977   PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
978   PetscTruth     usecprow=a->compressedrow.use;
979 
980   PetscFunctionBegin;
981   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
982   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
983 
984   idx = a->j;
985   v   = a->a;
986   if (usecprow){
987     mbs  = a->compressedrow.nrows;
988     ii   = a->compressedrow.i;
989     ridx = a->compressedrow.rindex;
990   } else {
991     mbs = a->mbs;
992     ii  = a->i;
993     z   = zarray;
994   }
995 
996   if (!a->mult_work) {
997     k    = PetscMax(A->rmap->n,A->cmap->n);
998     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
999   }
1000   work = a->mult_work;
1001   for (i=0; i<mbs; i++) {
1002     n     = ii[1] - ii[0]; ii++;
1003     ncols = n*bs;
1004     workt = work;
1005     nonzerorow += (n>0);
1006     for (j=0; j<n; j++) {
1007       xb = x + bs*(*idx++);
1008       for (k=0; k<bs; k++) workt[k] = xb[k];
1009       workt += bs;
1010     }
1011     if (usecprow) z = zarray + bs*ridx[i];
1012     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1013     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1014     v += n*bs2;
1015     if (!usecprow) z += bs;
1016   }
1017   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1018   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1019   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr);
1020   PetscFunctionReturn(0);
1021 }
1022 
1023 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
1024 #undef __FUNCT__
1025 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1026 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1027 {
1028   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
1029   const PetscScalar  *x;
1030   PetscScalar        *y,*z,sum;
1031   const MatScalar    *v;
1032   PetscErrorCode     ierr;
1033   PetscInt           mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0;
1034   const PetscInt     *idx,*ii;
1035   PetscTruth         usecprow=a->compressedrow.use;
1036 
1037   PetscFunctionBegin;
1038   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
1039   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1040   if (zz != yy) {
1041     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1042   } else {
1043     z = y;
1044   }
1045 
1046   idx = a->j;
1047   v   = a->a;
1048   if (usecprow){
1049     if (zz != yy){
1050       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1051     }
1052     mbs  = a->compressedrow.nrows;
1053     ii   = a->compressedrow.i;
1054     ridx = a->compressedrow.rindex;
1055   } else {
1056     ii  = a->i;
1057   }
1058 
1059   for (i=0; i<mbs; i++) {
1060     n    = ii[1] - ii[0];
1061     ii++;
1062     if (!usecprow){
1063       nonzerorow += (n>0);
1064       sum = y[i];
1065     } else {
1066       sum = y[ridx[i]];
1067     }
1068     PetscSparseDensePlusDot(sum,x,v,idx,n);
1069     v += n;
1070     idx += n;
1071     if (usecprow){
1072       z[ridx[i]] = sum;
1073     } else {
1074       z[i] = sum;
1075     }
1076   }
1077   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
1078   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1079   if (zz != yy) {
1080     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1081   }
1082   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
1083   PetscFunctionReturn(0);
1084 }
1085 
1086 #undef __FUNCT__
1087 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1088 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1089 {
1090   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1091   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
1092   PetscScalar    x1,x2,*yarray,*zarray;
1093   MatScalar      *v;
1094   PetscErrorCode ierr;
1095   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1096   PetscTruth     usecprow=a->compressedrow.use;
1097 
1098   PetscFunctionBegin;
1099   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1100   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1101   if (zz != yy) {
1102     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1103   } else {
1104     zarray = yarray;
1105   }
1106 
1107   idx = a->j;
1108   v   = a->a;
1109   if (usecprow){
1110     if (zz != yy){
1111       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1112     }
1113     mbs  = a->compressedrow.nrows;
1114     ii   = a->compressedrow.i;
1115     ridx = a->compressedrow.rindex;
1116     if (zz != yy){
1117       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1118     }
1119   } else {
1120     ii  = a->i;
1121     y   = yarray;
1122     z   = zarray;
1123   }
1124 
1125   for (i=0; i<mbs; i++) {
1126     n  = ii[1] - ii[0]; ii++;
1127     if (usecprow){
1128       z = zarray + 2*ridx[i];
1129       y = yarray + 2*ridx[i];
1130     }
1131     sum1 = y[0]; sum2 = y[1];
1132     for (j=0; j<n; j++) {
1133       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1134       sum1 += v[0]*x1 + v[2]*x2;
1135       sum2 += v[1]*x1 + v[3]*x2;
1136       v += 4;
1137     }
1138     z[0] = sum1; z[1] = sum2;
1139     if (!usecprow){
1140       z += 2; y += 2;
1141     }
1142   }
1143   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1144   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1145   if (zz != yy) {
1146     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1147   }
1148   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1149   PetscFunctionReturn(0);
1150 }
1151 
1152 #undef __FUNCT__
1153 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1154 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1155 {
1156   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1157   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1158   MatScalar      *v;
1159   PetscErrorCode ierr;
1160   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1161   PetscTruth     usecprow=a->compressedrow.use;
1162 
1163   PetscFunctionBegin;
1164   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1165   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1166   if (zz != yy) {
1167     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1168   } else {
1169     zarray = yarray;
1170   }
1171 
1172   idx = a->j;
1173   v   = a->a;
1174   if (usecprow){
1175     if (zz != yy){
1176       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1177     }
1178     mbs  = a->compressedrow.nrows;
1179     ii   = a->compressedrow.i;
1180     ridx = a->compressedrow.rindex;
1181   } else {
1182     ii  = a->i;
1183     y   = yarray;
1184     z   = zarray;
1185   }
1186 
1187   for (i=0; i<mbs; i++) {
1188     n  = ii[1] - ii[0]; ii++;
1189     if (usecprow){
1190       z = zarray + 3*ridx[i];
1191       y = yarray + 3*ridx[i];
1192     }
1193     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1194     for (j=0; j<n; j++) {
1195       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1196       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1197       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1198       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1199       v += 9;
1200     }
1201     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1202     if (!usecprow){
1203       z += 3; y += 3;
1204     }
1205   }
1206   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1207   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1208   if (zz != yy) {
1209     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1210   }
1211   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1212   PetscFunctionReturn(0);
1213 }
1214 
1215 #undef __FUNCT__
1216 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1217 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1218 {
1219   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1220   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1221   MatScalar      *v;
1222   PetscErrorCode ierr;
1223   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1224   PetscTruth     usecprow=a->compressedrow.use;
1225 
1226   PetscFunctionBegin;
1227   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1228   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1229   if (zz != yy) {
1230     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1231   } else {
1232     zarray = yarray;
1233   }
1234 
1235   idx   = a->j;
1236   v     = a->a;
1237   if (usecprow){
1238     if (zz != yy){
1239       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1240     }
1241     mbs  = a->compressedrow.nrows;
1242     ii   = a->compressedrow.i;
1243     ridx = a->compressedrow.rindex;
1244   } else {
1245     ii  = a->i;
1246     y   = yarray;
1247     z   = zarray;
1248   }
1249 
1250   for (i=0; i<mbs; i++) {
1251     n  = ii[1] - ii[0]; ii++;
1252     if (usecprow){
1253       z = zarray + 4*ridx[i];
1254       y = yarray + 4*ridx[i];
1255     }
1256     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1257     for (j=0; j<n; j++) {
1258       xb = x + 4*(*idx++);
1259       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1260       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1261       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1262       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1263       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1264       v += 16;
1265     }
1266     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1267     if (!usecprow){
1268       z += 4; y += 4;
1269     }
1270   }
1271   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1272   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1273   if (zz != yy) {
1274     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1275   }
1276   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1277   PetscFunctionReturn(0);
1278 }
1279 
1280 #undef __FUNCT__
1281 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1282 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1283 {
1284   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1285   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1286   PetscScalar    *yarray,*zarray;
1287   MatScalar      *v;
1288   PetscErrorCode ierr;
1289   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1290   PetscTruth     usecprow=a->compressedrow.use;
1291 
1292   PetscFunctionBegin;
1293   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1294   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1295   if (zz != yy) {
1296     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1297   } else {
1298     zarray = yarray;
1299   }
1300 
1301   idx = a->j;
1302   v   = a->a;
1303   if (usecprow){
1304     if (zz != yy){
1305       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1306     }
1307     mbs  = a->compressedrow.nrows;
1308     ii   = a->compressedrow.i;
1309     ridx = a->compressedrow.rindex;
1310   } else {
1311     ii  = a->i;
1312     y   = yarray;
1313     z   = zarray;
1314   }
1315 
1316   for (i=0; i<mbs; i++) {
1317     n  = ii[1] - ii[0]; ii++;
1318     if (usecprow){
1319       z = zarray + 5*ridx[i];
1320       y = yarray + 5*ridx[i];
1321     }
1322     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1323     for (j=0; j<n; j++) {
1324       xb = x + 5*(*idx++);
1325       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1326       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1327       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1328       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1329       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1330       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1331       v += 25;
1332     }
1333     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1334     if (!usecprow){
1335       z += 5; y += 5;
1336     }
1337   }
1338   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1339   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1340   if (zz != yy) {
1341     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1342   }
1343   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1344   PetscFunctionReturn(0);
1345 }
1346 #undef __FUNCT__
1347 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1348 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1349 {
1350   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1351   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1352   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1353   MatScalar      *v;
1354   PetscErrorCode ierr;
1355   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1356   PetscTruth     usecprow=a->compressedrow.use;
1357 
1358   PetscFunctionBegin;
1359   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1360   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1361   if (zz != yy) {
1362     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1363   } else {
1364     zarray = yarray;
1365   }
1366 
1367   idx = a->j;
1368   v   = a->a;
1369   if (usecprow){
1370     if (zz != yy){
1371       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1372     }
1373     mbs  = a->compressedrow.nrows;
1374     ii   = a->compressedrow.i;
1375     ridx = a->compressedrow.rindex;
1376   } else {
1377     ii  = a->i;
1378     y   = yarray;
1379     z   = zarray;
1380   }
1381 
1382   for (i=0; i<mbs; i++) {
1383     n  = ii[1] - ii[0]; ii++;
1384     if (usecprow){
1385       z = zarray + 6*ridx[i];
1386       y = yarray + 6*ridx[i];
1387     }
1388     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1389     for (j=0; j<n; j++) {
1390       xb = x + 6*(*idx++);
1391       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1392       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1393       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1394       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1395       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1396       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1397       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1398       v += 36;
1399     }
1400     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1401     if (!usecprow){
1402       z += 6; y += 6;
1403     }
1404   }
1405   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1406   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1407   if (zz != yy) {
1408     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1409   }
1410   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1411   PetscFunctionReturn(0);
1412 }
1413 
1414 #undef __FUNCT__
1415 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
1416 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1417 {
1418   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1419   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1420   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1421   MatScalar      *v;
1422   PetscErrorCode ierr;
1423   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1424   PetscTruth     usecprow=a->compressedrow.use;
1425 
1426   PetscFunctionBegin;
1427   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1428   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1429   if (zz != yy) {
1430     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1431   } else {
1432     zarray = yarray;
1433   }
1434 
1435   idx = a->j;
1436   v   = a->a;
1437   if (usecprow){
1438     if (zz != yy){
1439       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1440     }
1441     mbs  = a->compressedrow.nrows;
1442     ii   = a->compressedrow.i;
1443     ridx = a->compressedrow.rindex;
1444   } else {
1445     ii  = a->i;
1446     y   = yarray;
1447     z   = zarray;
1448   }
1449 
1450   for (i=0; i<mbs; i++) {
1451     n  = ii[1] - ii[0]; ii++;
1452     if (usecprow){
1453       z = zarray + 7*ridx[i];
1454       y = yarray + 7*ridx[i];
1455     }
1456     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1457     for (j=0; j<n; j++) {
1458       xb = x + 7*(*idx++);
1459       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1460       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1461       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1462       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1463       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1464       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1465       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1466       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1467       v += 49;
1468     }
1469     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1470     if (!usecprow){
1471       z += 7; y += 7;
1472     }
1473   }
1474   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1475   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1476   if (zz != yy) {
1477     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1478   }
1479   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1480   PetscFunctionReturn(0);
1481 }
1482 
1483 #undef __FUNCT__
1484 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
1485 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1486 {
1487   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1488   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1489   MatScalar      *v;
1490   PetscErrorCode ierr;
1491   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1492   PetscInt       ncols,k,*ridx=PETSC_NULL;
1493   PetscTruth     usecprow=a->compressedrow.use;
1494 
1495   PetscFunctionBegin;
1496   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1497   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1498   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1499 
1500   idx = a->j;
1501   v   = a->a;
1502   if (usecprow){
1503     mbs    = a->compressedrow.nrows;
1504     ii     = a->compressedrow.i;
1505     ridx = a->compressedrow.rindex;
1506   } else {
1507     mbs = a->mbs;
1508     ii  = a->i;
1509     z   = zarray;
1510   }
1511 
1512   if (!a->mult_work) {
1513     k    = PetscMax(A->rmap->n,A->cmap->n);
1514     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1515   }
1516   work = a->mult_work;
1517   for (i=0; i<mbs; i++) {
1518     n     = ii[1] - ii[0]; ii++;
1519     ncols = n*bs;
1520     workt = work;
1521     for (j=0; j<n; j++) {
1522       xb = x + bs*(*idx++);
1523       for (k=0; k<bs; k++) workt[k] = xb[k];
1524       workt += bs;
1525     }
1526     if (usecprow) z = zarray + bs*ridx[i];
1527     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1528     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1529     v += n*bs2;
1530     if (!usecprow){
1531       z += bs;
1532     }
1533   }
1534   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1535   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1536   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1537   PetscFunctionReturn(0);
1538 }
1539 
1540 #undef __FUNCT__
1541 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
1542 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1543 {
1544   PetscScalar    zero = 0.0;
1545   PetscErrorCode ierr;
1546 
1547   PetscFunctionBegin;
1548   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1549   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1550   PetscFunctionReturn(0);
1551 }
1552 
1553 #undef __FUNCT__
1554 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
1555 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1556 {
1557   PetscScalar    zero = 0.0;
1558   PetscErrorCode ierr;
1559 
1560   PetscFunctionBegin;
1561   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1562   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1563   PetscFunctionReturn(0);
1564 }
1565 
1566 #undef __FUNCT__
1567 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
1568 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1569 
1570 {
1571   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1572   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1573   MatScalar         *v;
1574   PetscErrorCode    ierr;
1575   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1576   Mat_CompressedRow cprow = a->compressedrow;
1577   PetscTruth        usecprow=cprow.use;
1578 
1579   PetscFunctionBegin;
1580   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1581   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1582   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1583 
1584   idx = a->j;
1585   v   = a->a;
1586   if (usecprow){
1587     mbs  = cprow.nrows;
1588     ii   = cprow.i;
1589     ridx = cprow.rindex;
1590   } else {
1591     mbs=a->mbs;
1592     ii = a->i;
1593     xb = x;
1594   }
1595 
1596   switch (bs) {
1597   case 1:
1598     for (i=0; i<mbs; i++) {
1599       if (usecprow) xb = x + ridx[i];
1600       x1 = xb[0];
1601       ib = idx + ii[0];
1602       n  = ii[1] - ii[0]; ii++;
1603       for (j=0; j<n; j++) {
1604         rval    = ib[j];
1605         z[rval] += PetscConj(*v) * x1;
1606         v++;
1607       }
1608       if (!usecprow) xb++;
1609     }
1610     break;
1611   case 2:
1612     for (i=0; i<mbs; i++) {
1613       if (usecprow) xb = x + 2*ridx[i];
1614       x1 = xb[0]; x2 = xb[1];
1615       ib = idx + ii[0];
1616       n  = ii[1] - ii[0]; ii++;
1617       for (j=0; j<n; j++) {
1618         rval      = ib[j]*2;
1619         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1620         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1621         v  += 4;
1622       }
1623       if (!usecprow) xb += 2;
1624     }
1625     break;
1626   case 3:
1627     for (i=0; i<mbs; i++) {
1628       if (usecprow) xb = x + 3*ridx[i];
1629       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1630       ib = idx + ii[0];
1631       n  = ii[1] - ii[0]; ii++;
1632       for (j=0; j<n; j++) {
1633         rval      = ib[j]*3;
1634         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1635         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1636         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1637         v  += 9;
1638       }
1639       if (!usecprow) xb += 3;
1640     }
1641     break;
1642   case 4:
1643     for (i=0; i<mbs; i++) {
1644       if (usecprow) xb = x + 4*ridx[i];
1645       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1646       ib = idx + ii[0];
1647       n  = ii[1] - ii[0]; ii++;
1648       for (j=0; j<n; j++) {
1649         rval      = ib[j]*4;
1650         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1651         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1652         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1653         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1654         v  += 16;
1655       }
1656       if (!usecprow) xb += 4;
1657     }
1658     break;
1659   case 5:
1660     for (i=0; i<mbs; i++) {
1661       if (usecprow) xb = x + 5*ridx[i];
1662       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1663       x4 = xb[3]; x5 = xb[4];
1664       ib = idx + ii[0];
1665       n  = ii[1] - ii[0]; ii++;
1666       for (j=0; j<n; j++) {
1667         rval      = ib[j]*5;
1668         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1669         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1670         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1671         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1672         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1673         v  += 25;
1674       }
1675       if (!usecprow) xb += 5;
1676     }
1677     break;
1678   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
1679       PetscInt     ncols,k;
1680       PetscScalar  *work,*workt,*xtmp;
1681 
1682       SETERRQ(PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1683       if (!a->mult_work) {
1684         k = PetscMax(A->rmap->n,A->cmap->n);
1685         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1686       }
1687       work = a->mult_work;
1688       xtmp = x;
1689       for (i=0; i<mbs; i++) {
1690         n     = ii[1] - ii[0]; ii++;
1691         ncols = n*bs;
1692         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1693         if (usecprow) {
1694           xtmp = x + bs*ridx[i];
1695         }
1696         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1697         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1698         v += n*bs2;
1699         if (!usecprow) xtmp += bs;
1700         workt = work;
1701         for (j=0; j<n; j++) {
1702           zb = z + bs*(*idx++);
1703           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1704           workt += bs;
1705         }
1706       }
1707     }
1708   }
1709   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1710   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1711   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1712   PetscFunctionReturn(0);
1713 }
1714 
1715 #undef __FUNCT__
1716 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
1717 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1718 {
1719   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1720   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1721   MatScalar         *v;
1722   PetscErrorCode    ierr;
1723   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1724   Mat_CompressedRow cprow = a->compressedrow;
1725   PetscTruth        usecprow=cprow.use;
1726 
1727   PetscFunctionBegin;
1728   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1729   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1730   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1731 
1732   idx = a->j;
1733   v   = a->a;
1734   if (usecprow){
1735     mbs  = cprow.nrows;
1736     ii   = cprow.i;
1737     ridx = cprow.rindex;
1738   } else {
1739     mbs=a->mbs;
1740     ii = a->i;
1741     xb = x;
1742   }
1743 
1744   switch (bs) {
1745   case 1:
1746     for (i=0; i<mbs; i++) {
1747       if (usecprow) xb = x + ridx[i];
1748       x1 = xb[0];
1749       ib = idx + ii[0];
1750       n  = ii[1] - ii[0]; ii++;
1751       for (j=0; j<n; j++) {
1752         rval    = ib[j];
1753         z[rval] += *v * x1;
1754         v++;
1755       }
1756       if (!usecprow) xb++;
1757     }
1758     break;
1759   case 2:
1760     for (i=0; i<mbs; i++) {
1761       if (usecprow) xb = x + 2*ridx[i];
1762       x1 = xb[0]; x2 = xb[1];
1763       ib = idx + ii[0];
1764       n  = ii[1] - ii[0]; ii++;
1765       for (j=0; j<n; j++) {
1766         rval      = ib[j]*2;
1767         z[rval++] += v[0]*x1 + v[1]*x2;
1768         z[rval++] += v[2]*x1 + v[3]*x2;
1769         v  += 4;
1770       }
1771       if (!usecprow) xb += 2;
1772     }
1773     break;
1774   case 3:
1775     for (i=0; i<mbs; i++) {
1776       if (usecprow) xb = x + 3*ridx[i];
1777       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1778       ib = idx + ii[0];
1779       n  = ii[1] - ii[0]; ii++;
1780       for (j=0; j<n; j++) {
1781         rval      = ib[j]*3;
1782         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1783         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1784         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1785         v  += 9;
1786       }
1787       if (!usecprow) xb += 3;
1788     }
1789     break;
1790   case 4:
1791     for (i=0; i<mbs; i++) {
1792       if (usecprow) xb = x + 4*ridx[i];
1793       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1794       ib = idx + ii[0];
1795       n  = ii[1] - ii[0]; ii++;
1796       for (j=0; j<n; j++) {
1797         rval      = ib[j]*4;
1798         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1799         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1800         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1801         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1802         v  += 16;
1803       }
1804       if (!usecprow) xb += 4;
1805     }
1806     break;
1807   case 5:
1808     for (i=0; i<mbs; i++) {
1809       if (usecprow) xb = x + 5*ridx[i];
1810       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1811       x4 = xb[3]; x5 = xb[4];
1812       ib = idx + ii[0];
1813       n  = ii[1] - ii[0]; ii++;
1814       for (j=0; j<n; j++) {
1815         rval      = ib[j]*5;
1816         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1817         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1818         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1819         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1820         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1821         v  += 25;
1822       }
1823       if (!usecprow) xb += 5;
1824     }
1825     break;
1826   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1827       PetscInt     ncols,k;
1828       PetscScalar  *work,*workt,*xtmp;
1829 
1830       if (!a->mult_work) {
1831         k = PetscMax(A->rmap->n,A->cmap->n);
1832         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1833       }
1834       work = a->mult_work;
1835       xtmp = x;
1836       for (i=0; i<mbs; i++) {
1837         n     = ii[1] - ii[0]; ii++;
1838         ncols = n*bs;
1839         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1840         if (usecprow) {
1841           xtmp = x + bs*ridx[i];
1842         }
1843         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1844         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1845         v += n*bs2;
1846         if (!usecprow) xtmp += bs;
1847         workt = work;
1848         for (j=0; j<n; j++) {
1849           zb = z + bs*(*idx++);
1850           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1851           workt += bs;
1852         }
1853       }
1854     }
1855   }
1856   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1857   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1858   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1859   PetscFunctionReturn(0);
1860 }
1861 
1862 #undef __FUNCT__
1863 #define __FUNCT__ "MatScale_SeqBAIJ"
1864 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1865 {
1866   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)inA->data;
1867   PetscInt       totalnz = a->bs2*a->nz;
1868   PetscScalar    oalpha = alpha;
1869   PetscErrorCode ierr;
1870   PetscBLASInt   one = 1,tnz = PetscBLASIntCast(totalnz);
1871 
1872   PetscFunctionBegin;
1873   BLASscal_(&tnz,&oalpha,a->a,&one);
1874   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1875   PetscFunctionReturn(0);
1876 }
1877 
1878 #undef __FUNCT__
1879 #define __FUNCT__ "MatNorm_SeqBAIJ"
1880 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1881 {
1882   PetscErrorCode ierr;
1883   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1884   MatScalar      *v = a->a;
1885   PetscReal      sum = 0.0;
1886   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1887 
1888   PetscFunctionBegin;
1889   if (type == NORM_FROBENIUS) {
1890     for (i=0; i< bs2*nz; i++) {
1891 #if defined(PETSC_USE_COMPLEX)
1892       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1893 #else
1894       sum += (*v)*(*v); v++;
1895 #endif
1896     }
1897     *norm = sqrt(sum);
1898   } else if (type == NORM_1) { /* maximum column sum */
1899     PetscReal *tmp;
1900     PetscInt  *bcol = a->j;
1901     ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
1902     ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr);
1903     for (i=0; i<nz; i++){
1904       for (j=0; j<bs; j++){
1905         k1 = bs*(*bcol) + j; /* column index */
1906         for (k=0; k<bs; k++){
1907           tmp[k1] += PetscAbsScalar(*v); v++;
1908         }
1909       }
1910       bcol++;
1911     }
1912     *norm = 0.0;
1913     for (j=0; j<A->cmap->n; j++) {
1914       if (tmp[j] > *norm) *norm = tmp[j];
1915     }
1916     ierr = PetscFree(tmp);CHKERRQ(ierr);
1917   } else if (type == NORM_INFINITY) { /* maximum row sum */
1918     *norm = 0.0;
1919     for (k=0; k<bs; k++) {
1920       for (j=0; j<a->mbs; j++) {
1921         v = a->a + bs2*a->i[j] + k;
1922         sum = 0.0;
1923         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1924           for (k1=0; k1<bs; k1++){
1925             sum += PetscAbsScalar(*v);
1926             v   += bs;
1927           }
1928         }
1929         if (sum > *norm) *norm = sum;
1930       }
1931     }
1932   } else {
1933     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1934   }
1935   PetscFunctionReturn(0);
1936 }
1937 
1938 
1939 #undef __FUNCT__
1940 #define __FUNCT__ "MatEqual_SeqBAIJ"
1941 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1942 {
1943   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1944   PetscErrorCode ierr;
1945 
1946   PetscFunctionBegin;
1947   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1948   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1949     *flg = PETSC_FALSE;
1950     PetscFunctionReturn(0);
1951   }
1952 
1953   /* if the a->i are the same */
1954   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1955   if (!*flg) {
1956     PetscFunctionReturn(0);
1957   }
1958 
1959   /* if a->j are the same */
1960   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1961   if (!*flg) {
1962     PetscFunctionReturn(0);
1963   }
1964   /* if a->a are the same */
1965   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1966   PetscFunctionReturn(0);
1967 
1968 }
1969 
1970 #undef __FUNCT__
1971 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
1972 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1973 {
1974   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1975   PetscErrorCode ierr;
1976   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1977   PetscScalar    *x,zero = 0.0;
1978   MatScalar      *aa,*aa_j;
1979 
1980   PetscFunctionBegin;
1981   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1982   bs   = A->rmap->bs;
1983   aa   = a->a;
1984   ai   = a->i;
1985   aj   = a->j;
1986   ambs = a->mbs;
1987   bs2  = a->bs2;
1988 
1989   ierr = VecSet(v,zero);CHKERRQ(ierr);
1990   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1991   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1992   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1993   for (i=0; i<ambs; i++) {
1994     for (j=ai[i]; j<ai[i+1]; j++) {
1995       if (aj[j] == i) {
1996         row  = i*bs;
1997         aa_j = aa+j*bs2;
1998         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1999         break;
2000       }
2001     }
2002   }
2003   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2004   PetscFunctionReturn(0);
2005 }
2006 
2007 #undef __FUNCT__
2008 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2009 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2010 {
2011   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2012   PetscScalar    *l,*r,x,*li,*ri;
2013   MatScalar      *aa,*v;
2014   PetscErrorCode ierr;
2015   PetscInt       i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
2016 
2017   PetscFunctionBegin;
2018   ai  = a->i;
2019   aj  = a->j;
2020   aa  = a->a;
2021   m   = A->rmap->n;
2022   n   = A->cmap->n;
2023   bs  = A->rmap->bs;
2024   mbs = a->mbs;
2025   bs2 = a->bs2;
2026   if (ll) {
2027     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
2028     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2029     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2030     for (i=0; i<mbs; i++) { /* for each block row */
2031       M  = ai[i+1] - ai[i];
2032       li = l + i*bs;
2033       v  = aa + bs2*ai[i];
2034       for (j=0; j<M; j++) { /* for each block */
2035         for (k=0; k<bs2; k++) {
2036           (*v++) *= li[k%bs];
2037         }
2038       }
2039     }
2040     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
2041     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2042   }
2043 
2044   if (rr) {
2045     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
2046     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2047     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2048     for (i=0; i<mbs; i++) { /* for each block row */
2049       M  = ai[i+1] - ai[i];
2050       v  = aa + bs2*ai[i];
2051       for (j=0; j<M; j++) { /* for each block */
2052         ri = r + bs*aj[ai[i]+j];
2053         for (k=0; k<bs; k++) {
2054           x = ri[k];
2055           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
2056         }
2057       }
2058     }
2059     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
2060     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2061   }
2062   PetscFunctionReturn(0);
2063 }
2064 
2065 
2066 #undef __FUNCT__
2067 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2068 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2069 {
2070   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2071 
2072   PetscFunctionBegin;
2073   info->block_size     = a->bs2;
2074   info->nz_allocated   = a->maxnz;
2075   info->nz_used        = a->bs2*a->nz;
2076   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
2077   info->assemblies   = A->num_ass;
2078   info->mallocs      = a->reallocs;
2079   info->memory       = ((PetscObject)A)->mem;
2080   if (A->factor) {
2081     info->fill_ratio_given  = A->info.fill_ratio_given;
2082     info->fill_ratio_needed = A->info.fill_ratio_needed;
2083     info->factor_mallocs    = A->info.factor_mallocs;
2084   } else {
2085     info->fill_ratio_given  = 0;
2086     info->fill_ratio_needed = 0;
2087     info->factor_mallocs    = 0;
2088   }
2089   PetscFunctionReturn(0);
2090 }
2091 
2092 
2093 #undef __FUNCT__
2094 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2095 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2096 {
2097   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2098   PetscErrorCode ierr;
2099 
2100   PetscFunctionBegin;
2101   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2102   PetscFunctionReturn(0);
2103 }
2104