xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 8ab949d8a86daa69f82db2e3a2e7f07f1f4d47d0)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 
7 #undef __FUNCT__
8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
12   PetscErrorCode ierr;
13   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
14   const PetscInt *idx;
15   PetscInt       start,end,*ai,*aj,bs,*nidx2;
16   PetscBT        table;
17 
18   PetscFunctionBegin;
19   m     = a->mbs;
20   ai    = a->i;
21   aj    = a->j;
22   bs    = A->rmap->bs;
23 
24   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25 
26   ierr = PetscBTCreate(m,table);CHKERRQ(ierr);
27   ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
28   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) {
31     /* Initialise the two local arrays */
32     isz  = 0;
33     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
37     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j=0; j<n ; ++j){
41       ival = idx[j]/bs; /* convert the indices into block indices */
42       if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
44     }
45     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
46     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
47 
48     k = 0;
49     for (j=0; j<ov; j++){ /* for each overlap*/
50       n = isz;
51       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row+1];
55         for (l = start; l<end ; l++){
56           val = aj[l];
57           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
58         }
59       }
60     }
61     /* expand the Index Set */
62     for (j=0; j<isz; j++) {
63       for (k=0; k<bs; k++)
64         nidx2[j*bs+k] = nidx[j]*bs+k;
65     }
66     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
67   }
68   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
69   ierr = PetscFree(nidx);CHKERRQ(ierr);
70   ierr = PetscFree(nidx2);CHKERRQ(ierr);
71   PetscFunctionReturn(0);
72 }
73 
74 #undef __FUNCT__
75 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
76 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77 {
78   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
79   PetscErrorCode ierr;
80   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
82   const PetscInt *irow,*icol;
83   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84   PetscInt       *aj = a->j,*ai = a->i;
85   MatScalar      *mat_a;
86   Mat            C;
87   PetscTruth     flag,sorted;
88 
89   PetscFunctionBegin;
90   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
91   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
92 
93   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
94   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
95   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
96   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
97 
98   ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
99   ssmap = smap;
100   ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
101   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
102   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
103   /* determine lens of each row */
104   for (i=0; i<nrows; i++) {
105     kstart  = ai[irow[i]];
106     kend    = kstart + a->ilen[irow[i]];
107     lens[i] = 0;
108       for (k=kstart; k<kend; k++) {
109         if (ssmap[aj[k]]) {
110           lens[i]++;
111         }
112       }
113     }
114   /* Create and fill new matrix */
115   if (scall == MAT_REUSE_MATRIX) {
116     c = (Mat_SeqBAIJ *)((*B)->data);
117 
118     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
119     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
120     if (!flag) {
121       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
122     }
123     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
124     C = *B;
125   } else {
126     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
127     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
128     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
129     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
130   }
131   c = (Mat_SeqBAIJ *)(C->data);
132   for (i=0; i<nrows; i++) {
133     row    = irow[i];
134     kstart = ai[row];
135     kend   = kstart + a->ilen[row];
136     mat_i  = c->i[i];
137     mat_j  = c->j + mat_i;
138     mat_a  = c->a + mat_i*bs2;
139     mat_ilen = c->ilen + i;
140     for (k=kstart; k<kend; k++) {
141       if ((tcol=ssmap[a->j[k]])) {
142         *mat_j++ = tcol - 1;
143         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
144         mat_a   += bs2;
145         (*mat_ilen)++;
146       }
147     }
148   }
149 
150   /* Free work space */
151   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
152   ierr = PetscFree(smap);CHKERRQ(ierr);
153   ierr = PetscFree(lens);CHKERRQ(ierr);
154   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
155   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
156 
157   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
158   *B = C;
159   PetscFunctionReturn(0);
160 }
161 
162 #undef __FUNCT__
163 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
164 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
165 {
166   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
167   IS             is1,is2;
168   PetscErrorCode ierr;
169   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
170   const PetscInt *irow,*icol;
171 
172   PetscFunctionBegin;
173   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
174   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
175   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
176   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
177 
178   /* Verify if the indices corespond to each element in a block
179    and form the IS with compressed IS */
180   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
181   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
182   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
183   count = 0;
184   for (i=0; i<a->mbs; i++) {
185     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
186     if (vary[i]==bs) iary[count++] = i;
187   }
188   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
189 
190   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
191   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
192   count = 0;
193   for (i=0; i<a->mbs; i++) {
194     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
195     if (vary[i]==bs) iary[count++] = i;
196   }
197   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);CHKERRQ(ierr);
198   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
199   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
200   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
201 
202   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
203   ISDestroy(is1);
204   ISDestroy(is2);
205   PetscFunctionReturn(0);
206 }
207 
208 #undef __FUNCT__
209 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
210 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
211 {
212   PetscErrorCode ierr;
213   PetscInt       i;
214 
215   PetscFunctionBegin;
216   if (scall == MAT_INITIAL_MATRIX) {
217     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
218   }
219 
220   for (i=0; i<n; i++) {
221     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
222   }
223   PetscFunctionReturn(0);
224 }
225 
226 
227 /* -------------------------------------------------------*/
228 /* Should check that shapes of vectors and matrices match */
229 /* -------------------------------------------------------*/
230 #include "petscblaslapack.h"
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatMult_SeqBAIJ_1"
234 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
235 {
236   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
237   PetscScalar       *z,sum;
238   const PetscScalar *x;
239   const MatScalar   *v;
240   PetscErrorCode    ierr;
241   PetscInt          mbs,i,n,nonzerorow=0;
242   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
243   PetscTruth        usecprow=a->compressedrow.use;
244 
245   PetscFunctionBegin;
246   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
247   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
248 
249   if (usecprow){
250     mbs  = a->compressedrow.nrows;
251     ii   = a->compressedrow.i;
252     ridx = a->compressedrow.rindex;
253     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
254   } else {
255     mbs = a->mbs;
256     ii  = a->i;
257   }
258 
259   for (i=0; i<mbs; i++) {
260     n    = ii[1] - ii[0];
261     v    = a->a + ii[0];
262     idx  = a->j + ii[0];
263     ii++;
264     sum  = 0.0;
265     PetscSparseDensePlusDot(sum,x,v,idx,n);
266     if (usecprow){
267       z[ridx[i]] = sum;
268     } else {
269       nonzerorow += (n>0);
270       z[i] = sum;
271     }
272   }
273   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
274   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
275   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
276   PetscFunctionReturn(0);
277 }
278 
279 #undef __FUNCT__
280 #define __FUNCT__ "MatMult_SeqBAIJ_2"
281 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
282 {
283   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
284   PetscScalar       *z = 0,sum1,sum2,*zarray;
285   const PetscScalar *x,*xb;
286   PetscScalar       x1,x2;
287   const MatScalar   *v;
288   PetscErrorCode    ierr;
289   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
290   PetscTruth        usecprow=a->compressedrow.use;
291 
292   PetscFunctionBegin;
293   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
294   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
295 
296   idx = a->j;
297   v   = a->a;
298   if (usecprow){
299     mbs  = a->compressedrow.nrows;
300     ii   = a->compressedrow.i;
301     ridx = a->compressedrow.rindex;
302   } else {
303     mbs = a->mbs;
304     ii  = a->i;
305     z   = zarray;
306   }
307 
308   for (i=0; i<mbs; i++) {
309     n  = ii[1] - ii[0]; ii++;
310     sum1 = 0.0; sum2 = 0.0;
311     nonzerorow += (n>0);
312     for (j=0; j<n; j++) {
313       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
314       sum1 += v[0]*x1 + v[2]*x2;
315       sum2 += v[1]*x1 + v[3]*x2;
316       v += 4;
317     }
318     if (usecprow) z = zarray + 2*ridx[i];
319     z[0] = sum1; z[1] = sum2;
320     if (!usecprow) z += 2;
321   }
322   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
323   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
324   ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr);
325   PetscFunctionReturn(0);
326 }
327 
328 #undef __FUNCT__
329 #define __FUNCT__ "MatMult_SeqBAIJ_3"
330 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
331 {
332   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
333   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
334   const PetscScalar *x,*xb;
335   const MatScalar   *v;
336   PetscErrorCode    ierr;
337   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
338   PetscTruth        usecprow=a->compressedrow.use;
339 
340 
341 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
342 #pragma disjoint(*v,*z,*xb)
343 #endif
344 
345   PetscFunctionBegin;
346   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
347   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
348 
349   idx = a->j;
350   v   = a->a;
351   if (usecprow){
352     mbs  = a->compressedrow.nrows;
353     ii   = a->compressedrow.i;
354     ridx = a->compressedrow.rindex;
355   } else {
356     mbs = a->mbs;
357     ii  = a->i;
358     z   = zarray;
359   }
360 
361   for (i=0; i<mbs; i++) {
362     n  = ii[1] - ii[0]; ii++;
363     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
364     nonzerorow += (n>0);
365     for (j=0; j<n; j++) {
366       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
367       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
368       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
369       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
370       v += 9;
371     }
372     if (usecprow) z = zarray + 3*ridx[i];
373     z[0] = sum1; z[1] = sum2; z[2] = sum3;
374     if (!usecprow) z += 3;
375   }
376   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
377   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
378   ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr);
379   PetscFunctionReturn(0);
380 }
381 
382 #undef __FUNCT__
383 #define __FUNCT__ "MatMult_SeqBAIJ_4"
384 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
385 {
386   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
387   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
388   const PetscScalar *x,*xb;
389   const MatScalar   *v;
390   PetscErrorCode    ierr;
391   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
392   PetscTruth        usecprow=a->compressedrow.use;
393 
394   PetscFunctionBegin;
395   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
396   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
397 
398   idx = a->j;
399   v   = a->a;
400   if (usecprow){
401     mbs  = a->compressedrow.nrows;
402     ii   = a->compressedrow.i;
403     ridx = a->compressedrow.rindex;
404   } else {
405     mbs = a->mbs;
406     ii  = a->i;
407     z   = zarray;
408   }
409 
410   for (i=0; i<mbs; i++) {
411     n  = ii[1] - ii[0]; ii++;
412     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
413     nonzerorow += (n>0);
414     for (j=0; j<n; j++) {
415       xb = x + 4*(*idx++);
416       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
417       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
418       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
419       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
420       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
421       v += 16;
422     }
423     if (usecprow) z = zarray + 4*ridx[i];
424     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
425     if (!usecprow) z += 4;
426   }
427   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
428   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
429   ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr);
430   PetscFunctionReturn(0);
431 }
432 
433 #undef __FUNCT__
434 #define __FUNCT__ "MatMult_SeqBAIJ_5"
435 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
436 {
437   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
438   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
439   const PetscScalar *xb,*x;
440   const MatScalar   *v;
441   PetscErrorCode    ierr;
442   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
443   PetscInt          mbs,i,j,n,nonzerorow=0;
444   PetscTruth        usecprow=a->compressedrow.use;
445 
446   PetscFunctionBegin;
447   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
448   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
449 
450   idx = a->j;
451   v   = a->a;
452   if (usecprow){
453     mbs  = a->compressedrow.nrows;
454     ii   = a->compressedrow.i;
455     ridx = a->compressedrow.rindex;
456   } else {
457     mbs = a->mbs;
458     ii  = a->i;
459     z   = zarray;
460   }
461 
462   for (i=0; i<mbs; i++) {
463     n  = ii[1] - ii[0]; ii++;
464     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
465     nonzerorow += (n>0);
466     for (j=0; j<n; j++) {
467       xb = x + 5*(*idx++);
468       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
469       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
470       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
471       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
472       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
473       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
474       v += 25;
475     }
476     if (usecprow) z = zarray + 5*ridx[i];
477     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
478     if (!usecprow) z += 5;
479   }
480   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
481   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
482   ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr);
483   PetscFunctionReturn(0);
484 }
485 
486 
487 #undef __FUNCT__
488 #define __FUNCT__ "MatMult_SeqBAIJ_6"
489 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
490 {
491   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
492   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
493   const PetscScalar *x,*xb;
494   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
495   const MatScalar   *v;
496   PetscErrorCode    ierr;
497   PetscInt          mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
498   PetscTruth        usecprow=a->compressedrow.use;
499 
500   PetscFunctionBegin;
501   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
502   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
503 
504   idx = a->j;
505   v   = a->a;
506   if (usecprow){
507     mbs  = a->compressedrow.nrows;
508     ii   = a->compressedrow.i;
509     ridx = a->compressedrow.rindex;
510   } else {
511     mbs = a->mbs;
512     ii  = a->i;
513     z   = zarray;
514   }
515 
516   for (i=0; i<mbs; i++) {
517     n  = ii[1] - ii[0]; ii++;
518     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
519     nonzerorow += (n>0);
520     for (j=0; j<n; j++) {
521       xb = x + 6*(*idx++);
522       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
523       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
524       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
525       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
526       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
527       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
528       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
529       v += 36;
530     }
531     if (usecprow) z = zarray + 6*ridx[i];
532     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
533     if (!usecprow) z += 6;
534   }
535 
536   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
537   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
538   ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr);
539   PetscFunctionReturn(0);
540 }
541 
542 #undef __FUNCT__
543 #define __FUNCT__ "MatMult_SeqBAIJ_7"
544 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
545 {
546   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
547   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
548   const PetscScalar *x,*xb;
549   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
550   const MatScalar   *v;
551   PetscErrorCode    ierr;
552   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
553   PetscTruth        usecprow=a->compressedrow.use;
554 
555   PetscFunctionBegin;
556   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
557   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
558 
559   idx = a->j;
560   v   = a->a;
561   if (usecprow){
562     mbs    = a->compressedrow.nrows;
563     ii     = a->compressedrow.i;
564     ridx = a->compressedrow.rindex;
565   } else {
566     mbs = a->mbs;
567     ii  = a->i;
568     z   = zarray;
569   }
570 
571   for (i=0; i<mbs; i++) {
572     n  = ii[1] - ii[0]; ii++;
573     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
574     nonzerorow += (n>0);
575     for (j=0; j<n; j++) {
576       xb = x + 7*(*idx++);
577       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
578       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
579       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
580       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
581       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
582       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
583       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
584       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
585       v += 49;
586     }
587     if (usecprow) z = zarray + 7*ridx[i];
588     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
589     if (!usecprow) z += 7;
590   }
591 
592   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
593   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
594   ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr);
595   PetscFunctionReturn(0);
596 }
597 
598 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
599 
600 #undef __FUNCT__
601 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
602 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
603 {
604   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
605   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
606   const PetscScalar *x,*xb;
607   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
608   const MatScalar   *v;
609   PetscErrorCode    ierr;
610   const PetscInt    *ii,*ij=a->j,*idx;
611   PetscInt          mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0;
612   PetscTruth        usecprow=a->compressedrow.use;
613 
614   PetscFunctionBegin;
615   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
616   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
617 
618   v   = a->a;
619   if (usecprow){
620     mbs    = a->compressedrow.nrows;
621     ii     = a->compressedrow.i;
622     ridx = a->compressedrow.rindex;
623   } else {
624     mbs = a->mbs;
625     ii  = a->i;
626     z   = zarray;
627   }
628 
629   for (i=0; i<mbs; i++) {
630     n  = ii[i+1] - ii[i];
631     idx = ij + ii[i];
632     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
633     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
634 
635     nonzerorow += (n>0);
636     for (j=0; j<n; j++) {
637       xb = x + 15*(idx[j]);
638       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
639       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
640 
641       for(k=0;k<15;k++){
642 	sum1  += v[0]*xb[k];
643         sum2  += v[1]*xb[k];
644 	sum3  += v[2]*xb[k];
645 	sum4  += v[3]*xb[k];
646 	sum5  += v[4]*xb[k];
647         sum6  += v[5]*xb[k];
648 	sum7  += v[6]*xb[k];
649 	sum8  += v[7]*xb[k];
650         sum9  += v[8]*xb[k];
651         sum10 += v[9]*xb[k];
652 	sum11 += v[10]*xb[k];
653 	sum12 += v[11]*xb[k];
654 	sum13 += v[12]*xb[k];
655         sum14 += v[13]*xb[k];
656 	sum15 += v[14]*xb[k];
657 	v += 15;
658       }
659     }
660     if (usecprow) z = zarray + 15*ridx[i];
661     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
662     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
663 
664     if (!usecprow) z += 15;
665   }
666 
667   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
668   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
669   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
670   PetscFunctionReturn(0);
671 }
672 
673 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
674 #undef __FUNCT__
675 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
676 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
677 {
678   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
679   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
680   const PetscScalar *x,*xb;
681   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
682   const MatScalar   *v;
683   PetscErrorCode    ierr;
684   const PetscInt    *ii,*ij=a->j,*idx;
685   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
686   PetscTruth        usecprow=a->compressedrow.use;
687 
688   PetscFunctionBegin;
689   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
690   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
691 
692   v   = a->a;
693   if (usecprow){
694     mbs    = a->compressedrow.nrows;
695     ii     = a->compressedrow.i;
696     ridx = a->compressedrow.rindex;
697   } else {
698     mbs = a->mbs;
699     ii  = a->i;
700     z   = zarray;
701   }
702 
703   for (i=0; i<mbs; i++) {
704     n  = ii[i+1] - ii[i];
705     idx = ij + ii[i];
706     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
707     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
708 
709     nonzerorow += (n>0);
710     for (j=0; j<n; j++) {
711       xb = x + 15*(idx[j]);
712       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
713       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
714 
715       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
716       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
717       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
718       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
719       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
720       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
721       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
722       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
723       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
724       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
725       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
726       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
727       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
728       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
729       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
730 
731       v += 60;
732 
733       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
734       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
735       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
736       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
737       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
738       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
739       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
740       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
741       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
742       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
743       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
744       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
745       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
746       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
747       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
748       v += 60;
749 
750       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
751       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
752       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
753       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
754       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
755       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
756       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
757       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
758       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
759       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
760       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
761       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
762       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
763       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
764       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
765       v  += 60;
766 
767       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
768       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
769       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
770       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
771       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
772       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
773       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
774       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
775       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
776       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
777       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
778       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
779       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
780       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
781       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
782       v += 45;
783     }
784     if (usecprow) z = zarray + 15*ridx[i];
785     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
786     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
787 
788     if (!usecprow) z += 15;
789   }
790 
791   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
792   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
793   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
794   PetscFunctionReturn(0);
795 }
796 
797 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
798 #undef __FUNCT__
799 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
800 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
801 {
802   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
803   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
804   const PetscScalar *x,*xb;
805   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
806   const MatScalar   *v;
807   PetscErrorCode    ierr;
808   const PetscInt    *ii,*ij=a->j,*idx;
809   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
810   PetscTruth        usecprow=a->compressedrow.use;
811 
812   PetscFunctionBegin;
813   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
814   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
815 
816   v   = a->a;
817   if (usecprow){
818     mbs    = a->compressedrow.nrows;
819     ii     = a->compressedrow.i;
820     ridx = a->compressedrow.rindex;
821   } else {
822     mbs = a->mbs;
823     ii  = a->i;
824     z   = zarray;
825   }
826 
827   for (i=0; i<mbs; i++) {
828     n  = ii[i+1] - ii[i];
829     idx = ij + ii[i];
830     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
831     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
832 
833     nonzerorow += (n>0);
834     for (j=0; j<n; j++) {
835       xb = x + 15*(idx[j]);
836       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
837       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
838 
839       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
840       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
841       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
842       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
843       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
844       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
845       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
846       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
847       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
848       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
849       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
850       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
851       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
852       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
853       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
854       v += 120;
855 
856       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
857       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
858       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
859       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
860       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
861       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
862       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
863       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
864       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
865       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
866       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
867       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
868       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
869       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
870       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
871       v += 105;
872     }
873     if (usecprow) z = zarray + 15*ridx[i];
874     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
875     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
876 
877     if (!usecprow) z += 15;
878   }
879 
880   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
881   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
882   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
883   PetscFunctionReturn(0);
884 }
885 
886 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
887 
888 #undef __FUNCT__
889 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
890 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
891 {
892   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
893   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
894   const PetscScalar *x,*xb;
895   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
896   const MatScalar   *v;
897   PetscErrorCode    ierr;
898   const PetscInt    *ii,*ij=a->j,*idx;
899   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
900   PetscTruth        usecprow=a->compressedrow.use;
901 
902   PetscFunctionBegin;
903   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
904   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
905 
906   v   = a->a;
907   if (usecprow){
908     mbs    = a->compressedrow.nrows;
909     ii     = a->compressedrow.i;
910     ridx = a->compressedrow.rindex;
911   } else {
912     mbs = a->mbs;
913     ii  = a->i;
914     z   = zarray;
915   }
916 
917   for (i=0; i<mbs; i++) {
918     n  = ii[i+1] - ii[i];
919     idx = ij + ii[i];
920     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
921     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
922 
923     nonzerorow += (n>0);
924     for (j=0; j<n; j++) {
925       xb = x + 15*(idx[j]);
926       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
927       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
928 
929       sum1 +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
930       sum2 +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
931       sum3 +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
932       sum4 +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
933       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
934       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
935       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
936       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
937       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
938       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
939       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
940       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
941       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
942       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
943       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
944       v += 225;
945     }
946     if (usecprow) z = zarray + 15*ridx[i];
947     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
948     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
949 
950     if (!usecprow) z += 15;
951   }
952 
953   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
954   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
955   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
956   PetscFunctionReturn(0);
957 }
958 
959 
960 /*
961     This will not work with MatScalar == float because it calls the BLAS
962 */
963 #undef __FUNCT__
964 #define __FUNCT__ "MatMult_SeqBAIJ_N"
965 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
966 {
967   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
968   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
969   MatScalar      *v;
970   PetscErrorCode ierr;
971   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
972   PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
973   PetscTruth     usecprow=a->compressedrow.use;
974 
975   PetscFunctionBegin;
976   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
977   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
978 
979   idx = a->j;
980   v   = a->a;
981   if (usecprow){
982     mbs  = a->compressedrow.nrows;
983     ii   = a->compressedrow.i;
984     ridx = a->compressedrow.rindex;
985   } else {
986     mbs = a->mbs;
987     ii  = a->i;
988     z   = zarray;
989   }
990 
991   if (!a->mult_work) {
992     k    = PetscMax(A->rmap->n,A->cmap->n);
993     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
994   }
995   work = a->mult_work;
996   for (i=0; i<mbs; i++) {
997     n     = ii[1] - ii[0]; ii++;
998     ncols = n*bs;
999     workt = work;
1000     nonzerorow += (n>0);
1001     for (j=0; j<n; j++) {
1002       xb = x + bs*(*idx++);
1003       for (k=0; k<bs; k++) workt[k] = xb[k];
1004       workt += bs;
1005     }
1006     if (usecprow) z = zarray + bs*ridx[i];
1007     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1008     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1009     v += n*bs2;
1010     if (!usecprow) z += bs;
1011   }
1012   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1013   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1014   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr);
1015   PetscFunctionReturn(0);
1016 }
1017 
1018 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
1019 #undef __FUNCT__
1020 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1021 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1022 {
1023   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
1024   const PetscScalar  *x;
1025   PetscScalar        *y,*z,sum;
1026   const MatScalar    *v;
1027   PetscErrorCode     ierr;
1028   PetscInt           mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0;
1029   const PetscInt     *idx,*ii;
1030   PetscTruth         usecprow=a->compressedrow.use;
1031 
1032   PetscFunctionBegin;
1033   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
1034   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1035   if (zz != yy) {
1036     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1037   } else {
1038     z = y;
1039   }
1040 
1041   idx = a->j;
1042   v   = a->a;
1043   if (usecprow){
1044     if (zz != yy){
1045       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1046     }
1047     mbs  = a->compressedrow.nrows;
1048     ii   = a->compressedrow.i;
1049     ridx = a->compressedrow.rindex;
1050   } else {
1051     ii  = a->i;
1052   }
1053 
1054   for (i=0; i<mbs; i++) {
1055     n    = ii[1] - ii[0];
1056     ii++;
1057     if (!usecprow){
1058       nonzerorow += (n>0);
1059       sum = y[i];
1060     } else {
1061       sum = y[ridx[i]];
1062     }
1063     PetscSparseDensePlusDot(sum,x,v,idx,n);
1064     v += n;
1065     idx += n;
1066     if (usecprow){
1067       z[ridx[i]] = sum;
1068     } else {
1069       z[i] = sum;
1070     }
1071   }
1072   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
1073   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1074   if (zz != yy) {
1075     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1076   }
1077   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
1078   PetscFunctionReturn(0);
1079 }
1080 
1081 #undef __FUNCT__
1082 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1083 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1084 {
1085   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1086   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
1087   PetscScalar    x1,x2,*yarray,*zarray;
1088   MatScalar      *v;
1089   PetscErrorCode ierr;
1090   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1091   PetscTruth     usecprow=a->compressedrow.use;
1092 
1093   PetscFunctionBegin;
1094   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1095   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1096   if (zz != yy) {
1097     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1098   } else {
1099     zarray = yarray;
1100   }
1101 
1102   idx = a->j;
1103   v   = a->a;
1104   if (usecprow){
1105     if (zz != yy){
1106       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1107     }
1108     mbs  = a->compressedrow.nrows;
1109     ii   = a->compressedrow.i;
1110     ridx = a->compressedrow.rindex;
1111     if (zz != yy){
1112       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1113     }
1114   } else {
1115     ii  = a->i;
1116     y   = yarray;
1117     z   = zarray;
1118   }
1119 
1120   for (i=0; i<mbs; i++) {
1121     n  = ii[1] - ii[0]; ii++;
1122     if (usecprow){
1123       z = zarray + 2*ridx[i];
1124       y = yarray + 2*ridx[i];
1125     }
1126     sum1 = y[0]; sum2 = y[1];
1127     for (j=0; j<n; j++) {
1128       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1129       sum1 += v[0]*x1 + v[2]*x2;
1130       sum2 += v[1]*x1 + v[3]*x2;
1131       v += 4;
1132     }
1133     z[0] = sum1; z[1] = sum2;
1134     if (!usecprow){
1135       z += 2; y += 2;
1136     }
1137   }
1138   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1139   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1140   if (zz != yy) {
1141     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1142   }
1143   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1144   PetscFunctionReturn(0);
1145 }
1146 
1147 #undef __FUNCT__
1148 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1149 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1150 {
1151   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1152   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1153   MatScalar      *v;
1154   PetscErrorCode ierr;
1155   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1156   PetscTruth     usecprow=a->compressedrow.use;
1157 
1158   PetscFunctionBegin;
1159   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1160   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1161   if (zz != yy) {
1162     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1163   } else {
1164     zarray = yarray;
1165   }
1166 
1167   idx = a->j;
1168   v   = a->a;
1169   if (usecprow){
1170     if (zz != yy){
1171       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1172     }
1173     mbs  = a->compressedrow.nrows;
1174     ii   = a->compressedrow.i;
1175     ridx = a->compressedrow.rindex;
1176   } else {
1177     ii  = a->i;
1178     y   = yarray;
1179     z   = zarray;
1180   }
1181 
1182   for (i=0; i<mbs; i++) {
1183     n  = ii[1] - ii[0]; ii++;
1184     if (usecprow){
1185       z = zarray + 3*ridx[i];
1186       y = yarray + 3*ridx[i];
1187     }
1188     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1189     for (j=0; j<n; j++) {
1190       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1191       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1192       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1193       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1194       v += 9;
1195     }
1196     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1197     if (!usecprow){
1198       z += 3; y += 3;
1199     }
1200   }
1201   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1202   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1203   if (zz != yy) {
1204     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1205   }
1206   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1207   PetscFunctionReturn(0);
1208 }
1209 
1210 #undef __FUNCT__
1211 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1212 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1213 {
1214   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1215   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1216   MatScalar      *v;
1217   PetscErrorCode ierr;
1218   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1219   PetscTruth     usecprow=a->compressedrow.use;
1220 
1221   PetscFunctionBegin;
1222   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1223   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1224   if (zz != yy) {
1225     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1226   } else {
1227     zarray = yarray;
1228   }
1229 
1230   idx   = a->j;
1231   v     = a->a;
1232   if (usecprow){
1233     if (zz != yy){
1234       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1235     }
1236     mbs  = a->compressedrow.nrows;
1237     ii   = a->compressedrow.i;
1238     ridx = a->compressedrow.rindex;
1239   } else {
1240     ii  = a->i;
1241     y   = yarray;
1242     z   = zarray;
1243   }
1244 
1245   for (i=0; i<mbs; i++) {
1246     n  = ii[1] - ii[0]; ii++;
1247     if (usecprow){
1248       z = zarray + 4*ridx[i];
1249       y = yarray + 4*ridx[i];
1250     }
1251     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1252     for (j=0; j<n; j++) {
1253       xb = x + 4*(*idx++);
1254       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1255       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1256       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1257       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1258       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1259       v += 16;
1260     }
1261     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1262     if (!usecprow){
1263       z += 4; y += 4;
1264     }
1265   }
1266   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1267   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1268   if (zz != yy) {
1269     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1270   }
1271   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1272   PetscFunctionReturn(0);
1273 }
1274 
1275 #undef __FUNCT__
1276 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1277 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1278 {
1279   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1280   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1281   PetscScalar    *yarray,*zarray;
1282   MatScalar      *v;
1283   PetscErrorCode ierr;
1284   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1285   PetscTruth     usecprow=a->compressedrow.use;
1286 
1287   PetscFunctionBegin;
1288   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1289   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1290   if (zz != yy) {
1291     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1292   } else {
1293     zarray = yarray;
1294   }
1295 
1296   idx = a->j;
1297   v   = a->a;
1298   if (usecprow){
1299     if (zz != yy){
1300       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1301     }
1302     mbs  = a->compressedrow.nrows;
1303     ii   = a->compressedrow.i;
1304     ridx = a->compressedrow.rindex;
1305   } else {
1306     ii  = a->i;
1307     y   = yarray;
1308     z   = zarray;
1309   }
1310 
1311   for (i=0; i<mbs; i++) {
1312     n  = ii[1] - ii[0]; ii++;
1313     if (usecprow){
1314       z = zarray + 5*ridx[i];
1315       y = yarray + 5*ridx[i];
1316     }
1317     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1318     for (j=0; j<n; j++) {
1319       xb = x + 5*(*idx++);
1320       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1321       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1322       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1323       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1324       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1325       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1326       v += 25;
1327     }
1328     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1329     if (!usecprow){
1330       z += 5; y += 5;
1331     }
1332   }
1333   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1334   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1335   if (zz != yy) {
1336     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1337   }
1338   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1339   PetscFunctionReturn(0);
1340 }
1341 #undef __FUNCT__
1342 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1343 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1344 {
1345   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1346   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1347   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1348   MatScalar      *v;
1349   PetscErrorCode ierr;
1350   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1351   PetscTruth     usecprow=a->compressedrow.use;
1352 
1353   PetscFunctionBegin;
1354   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1355   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1356   if (zz != yy) {
1357     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1358   } else {
1359     zarray = yarray;
1360   }
1361 
1362   idx = a->j;
1363   v   = a->a;
1364   if (usecprow){
1365     if (zz != yy){
1366       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1367     }
1368     mbs  = a->compressedrow.nrows;
1369     ii   = a->compressedrow.i;
1370     ridx = a->compressedrow.rindex;
1371   } else {
1372     ii  = a->i;
1373     y   = yarray;
1374     z   = zarray;
1375   }
1376 
1377   for (i=0; i<mbs; i++) {
1378     n  = ii[1] - ii[0]; ii++;
1379     if (usecprow){
1380       z = zarray + 6*ridx[i];
1381       y = yarray + 6*ridx[i];
1382     }
1383     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1384     for (j=0; j<n; j++) {
1385       xb = x + 6*(*idx++);
1386       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1387       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1388       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1389       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1390       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1391       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1392       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1393       v += 36;
1394     }
1395     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1396     if (!usecprow){
1397       z += 6; y += 6;
1398     }
1399   }
1400   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1401   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1402   if (zz != yy) {
1403     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1404   }
1405   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1406   PetscFunctionReturn(0);
1407 }
1408 
1409 #undef __FUNCT__
1410 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
1411 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1412 {
1413   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1414   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1415   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1416   MatScalar      *v;
1417   PetscErrorCode ierr;
1418   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1419   PetscTruth     usecprow=a->compressedrow.use;
1420 
1421   PetscFunctionBegin;
1422   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1423   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1424   if (zz != yy) {
1425     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1426   } else {
1427     zarray = yarray;
1428   }
1429 
1430   idx = a->j;
1431   v   = a->a;
1432   if (usecprow){
1433     if (zz != yy){
1434       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1435     }
1436     mbs  = a->compressedrow.nrows;
1437     ii   = a->compressedrow.i;
1438     ridx = a->compressedrow.rindex;
1439   } else {
1440     ii  = a->i;
1441     y   = yarray;
1442     z   = zarray;
1443   }
1444 
1445   for (i=0; i<mbs; i++) {
1446     n  = ii[1] - ii[0]; ii++;
1447     if (usecprow){
1448       z = zarray + 7*ridx[i];
1449       y = yarray + 7*ridx[i];
1450     }
1451     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1452     for (j=0; j<n; j++) {
1453       xb = x + 7*(*idx++);
1454       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1455       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1456       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1457       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1458       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1459       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1460       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1461       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1462       v += 49;
1463     }
1464     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1465     if (!usecprow){
1466       z += 7; y += 7;
1467     }
1468   }
1469   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1470   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1471   if (zz != yy) {
1472     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1473   }
1474   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1475   PetscFunctionReturn(0);
1476 }
1477 
1478 #undef __FUNCT__
1479 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
1480 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1481 {
1482   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1483   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1484   MatScalar      *v;
1485   PetscErrorCode ierr;
1486   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1487   PetscInt       ncols,k,*ridx=PETSC_NULL;
1488   PetscTruth     usecprow=a->compressedrow.use;
1489 
1490   PetscFunctionBegin;
1491   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1492   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1493   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1494 
1495   idx = a->j;
1496   v   = a->a;
1497   if (usecprow){
1498     mbs    = a->compressedrow.nrows;
1499     ii     = a->compressedrow.i;
1500     ridx = a->compressedrow.rindex;
1501   } else {
1502     mbs = a->mbs;
1503     ii  = a->i;
1504     z   = zarray;
1505   }
1506 
1507   if (!a->mult_work) {
1508     k    = PetscMax(A->rmap->n,A->cmap->n);
1509     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1510   }
1511   work = a->mult_work;
1512   for (i=0; i<mbs; i++) {
1513     n     = ii[1] - ii[0]; ii++;
1514     ncols = n*bs;
1515     workt = work;
1516     for (j=0; j<n; j++) {
1517       xb = x + bs*(*idx++);
1518       for (k=0; k<bs; k++) workt[k] = xb[k];
1519       workt += bs;
1520     }
1521     if (usecprow) z = zarray + bs*ridx[i];
1522     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1523     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1524     v += n*bs2;
1525     if (!usecprow){
1526       z += bs;
1527     }
1528   }
1529   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1530   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1531   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1532   PetscFunctionReturn(0);
1533 }
1534 
1535 #undef __FUNCT__
1536 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
1537 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1538 {
1539   PetscScalar    zero = 0.0;
1540   PetscErrorCode ierr;
1541 
1542   PetscFunctionBegin;
1543   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1544   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1545   PetscFunctionReturn(0);
1546 }
1547 
1548 #undef __FUNCT__
1549 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
1550 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1551 {
1552   PetscScalar    zero = 0.0;
1553   PetscErrorCode ierr;
1554 
1555   PetscFunctionBegin;
1556   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1557   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1558   PetscFunctionReturn(0);
1559 }
1560 
1561 #undef __FUNCT__
1562 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
1563 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1564 
1565 {
1566   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1567   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1568   MatScalar         *v;
1569   PetscErrorCode    ierr;
1570   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1571   Mat_CompressedRow cprow = a->compressedrow;
1572   PetscTruth        usecprow=cprow.use;
1573 
1574   PetscFunctionBegin;
1575   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1576   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1577   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1578 
1579   idx = a->j;
1580   v   = a->a;
1581   if (usecprow){
1582     mbs  = cprow.nrows;
1583     ii   = cprow.i;
1584     ridx = cprow.rindex;
1585   } else {
1586     mbs=a->mbs;
1587     ii = a->i;
1588     xb = x;
1589   }
1590 
1591   switch (bs) {
1592   case 1:
1593     for (i=0; i<mbs; i++) {
1594       if (usecprow) xb = x + ridx[i];
1595       x1 = xb[0];
1596       ib = idx + ii[0];
1597       n  = ii[1] - ii[0]; ii++;
1598       for (j=0; j<n; j++) {
1599         rval    = ib[j];
1600         z[rval] += PetscConj(*v) * x1;
1601         v++;
1602       }
1603       if (!usecprow) xb++;
1604     }
1605     break;
1606   case 2:
1607     for (i=0; i<mbs; i++) {
1608       if (usecprow) xb = x + 2*ridx[i];
1609       x1 = xb[0]; x2 = xb[1];
1610       ib = idx + ii[0];
1611       n  = ii[1] - ii[0]; ii++;
1612       for (j=0; j<n; j++) {
1613         rval      = ib[j]*2;
1614         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1615         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1616         v  += 4;
1617       }
1618       if (!usecprow) xb += 2;
1619     }
1620     break;
1621   case 3:
1622     for (i=0; i<mbs; i++) {
1623       if (usecprow) xb = x + 3*ridx[i];
1624       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1625       ib = idx + ii[0];
1626       n  = ii[1] - ii[0]; ii++;
1627       for (j=0; j<n; j++) {
1628         rval      = ib[j]*3;
1629         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1630         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1631         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1632         v  += 9;
1633       }
1634       if (!usecprow) xb += 3;
1635     }
1636     break;
1637   case 4:
1638     for (i=0; i<mbs; i++) {
1639       if (usecprow) xb = x + 4*ridx[i];
1640       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1641       ib = idx + ii[0];
1642       n  = ii[1] - ii[0]; ii++;
1643       for (j=0; j<n; j++) {
1644         rval      = ib[j]*4;
1645         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1646         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1647         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1648         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1649         v  += 16;
1650       }
1651       if (!usecprow) xb += 4;
1652     }
1653     break;
1654   case 5:
1655     for (i=0; i<mbs; i++) {
1656       if (usecprow) xb = x + 5*ridx[i];
1657       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1658       x4 = xb[3]; x5 = xb[4];
1659       ib = idx + ii[0];
1660       n  = ii[1] - ii[0]; ii++;
1661       for (j=0; j<n; j++) {
1662         rval      = ib[j]*5;
1663         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1664         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1665         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1666         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1667         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1668         v  += 25;
1669       }
1670       if (!usecprow) xb += 5;
1671     }
1672     break;
1673   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
1674       PetscInt     ncols,k;
1675       PetscScalar  *work,*workt,*xtmp;
1676 
1677       SETERRQ(PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1678       if (!a->mult_work) {
1679         k = PetscMax(A->rmap->n,A->cmap->n);
1680         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1681       }
1682       work = a->mult_work;
1683       xtmp = x;
1684       for (i=0; i<mbs; i++) {
1685         n     = ii[1] - ii[0]; ii++;
1686         ncols = n*bs;
1687         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1688         if (usecprow) {
1689           xtmp = x + bs*ridx[i];
1690         }
1691         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1692         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1693         v += n*bs2;
1694         if (!usecprow) xtmp += bs;
1695         workt = work;
1696         for (j=0; j<n; j++) {
1697           zb = z + bs*(*idx++);
1698           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1699           workt += bs;
1700         }
1701       }
1702     }
1703   }
1704   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1705   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1706   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1707   PetscFunctionReturn(0);
1708 }
1709 
1710 #undef __FUNCT__
1711 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
1712 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1713 {
1714   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1715   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1716   MatScalar         *v;
1717   PetscErrorCode    ierr;
1718   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1719   Mat_CompressedRow cprow = a->compressedrow;
1720   PetscTruth        usecprow=cprow.use;
1721 
1722   PetscFunctionBegin;
1723   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1724   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1725   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1726 
1727   idx = a->j;
1728   v   = a->a;
1729   if (usecprow){
1730     mbs  = cprow.nrows;
1731     ii   = cprow.i;
1732     ridx = cprow.rindex;
1733   } else {
1734     mbs=a->mbs;
1735     ii = a->i;
1736     xb = x;
1737   }
1738 
1739   switch (bs) {
1740   case 1:
1741     for (i=0; i<mbs; i++) {
1742       if (usecprow) xb = x + ridx[i];
1743       x1 = xb[0];
1744       ib = idx + ii[0];
1745       n  = ii[1] - ii[0]; ii++;
1746       for (j=0; j<n; j++) {
1747         rval    = ib[j];
1748         z[rval] += *v * x1;
1749         v++;
1750       }
1751       if (!usecprow) xb++;
1752     }
1753     break;
1754   case 2:
1755     for (i=0; i<mbs; i++) {
1756       if (usecprow) xb = x + 2*ridx[i];
1757       x1 = xb[0]; x2 = xb[1];
1758       ib = idx + ii[0];
1759       n  = ii[1] - ii[0]; ii++;
1760       for (j=0; j<n; j++) {
1761         rval      = ib[j]*2;
1762         z[rval++] += v[0]*x1 + v[1]*x2;
1763         z[rval++] += v[2]*x1 + v[3]*x2;
1764         v  += 4;
1765       }
1766       if (!usecprow) xb += 2;
1767     }
1768     break;
1769   case 3:
1770     for (i=0; i<mbs; i++) {
1771       if (usecprow) xb = x + 3*ridx[i];
1772       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1773       ib = idx + ii[0];
1774       n  = ii[1] - ii[0]; ii++;
1775       for (j=0; j<n; j++) {
1776         rval      = ib[j]*3;
1777         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1778         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1779         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1780         v  += 9;
1781       }
1782       if (!usecprow) xb += 3;
1783     }
1784     break;
1785   case 4:
1786     for (i=0; i<mbs; i++) {
1787       if (usecprow) xb = x + 4*ridx[i];
1788       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1789       ib = idx + ii[0];
1790       n  = ii[1] - ii[0]; ii++;
1791       for (j=0; j<n; j++) {
1792         rval      = ib[j]*4;
1793         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1794         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1795         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1796         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1797         v  += 16;
1798       }
1799       if (!usecprow) xb += 4;
1800     }
1801     break;
1802   case 5:
1803     for (i=0; i<mbs; i++) {
1804       if (usecprow) xb = x + 5*ridx[i];
1805       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1806       x4 = xb[3]; x5 = xb[4];
1807       ib = idx + ii[0];
1808       n  = ii[1] - ii[0]; ii++;
1809       for (j=0; j<n; j++) {
1810         rval      = ib[j]*5;
1811         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1812         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1813         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1814         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1815         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1816         v  += 25;
1817       }
1818       if (!usecprow) xb += 5;
1819     }
1820     break;
1821   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1822       PetscInt     ncols,k;
1823       PetscScalar  *work,*workt,*xtmp;
1824 
1825       if (!a->mult_work) {
1826         k = PetscMax(A->rmap->n,A->cmap->n);
1827         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1828       }
1829       work = a->mult_work;
1830       xtmp = x;
1831       for (i=0; i<mbs; i++) {
1832         n     = ii[1] - ii[0]; ii++;
1833         ncols = n*bs;
1834         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1835         if (usecprow) {
1836           xtmp = x + bs*ridx[i];
1837         }
1838         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1839         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1840         v += n*bs2;
1841         if (!usecprow) xtmp += bs;
1842         workt = work;
1843         for (j=0; j<n; j++) {
1844           zb = z + bs*(*idx++);
1845           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1846           workt += bs;
1847         }
1848       }
1849     }
1850   }
1851   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1852   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1853   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1854   PetscFunctionReturn(0);
1855 }
1856 
1857 #undef __FUNCT__
1858 #define __FUNCT__ "MatScale_SeqBAIJ"
1859 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1860 {
1861   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)inA->data;
1862   PetscInt       totalnz = a->bs2*a->nz;
1863   PetscScalar    oalpha = alpha;
1864   PetscErrorCode ierr;
1865   PetscBLASInt   one = 1,tnz = PetscBLASIntCast(totalnz);
1866 
1867   PetscFunctionBegin;
1868   BLASscal_(&tnz,&oalpha,a->a,&one);
1869   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1870   PetscFunctionReturn(0);
1871 }
1872 
1873 #undef __FUNCT__
1874 #define __FUNCT__ "MatNorm_SeqBAIJ"
1875 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1876 {
1877   PetscErrorCode ierr;
1878   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1879   MatScalar      *v = a->a;
1880   PetscReal      sum = 0.0;
1881   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1882 
1883   PetscFunctionBegin;
1884   if (type == NORM_FROBENIUS) {
1885     for (i=0; i< bs2*nz; i++) {
1886 #if defined(PETSC_USE_COMPLEX)
1887       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1888 #else
1889       sum += (*v)*(*v); v++;
1890 #endif
1891     }
1892     *norm = sqrt(sum);
1893   } else if (type == NORM_1) { /* maximum column sum */
1894     PetscReal *tmp;
1895     PetscInt  *bcol = a->j;
1896     ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
1897     ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr);
1898     for (i=0; i<nz; i++){
1899       for (j=0; j<bs; j++){
1900         k1 = bs*(*bcol) + j; /* column index */
1901         for (k=0; k<bs; k++){
1902           tmp[k1] += PetscAbsScalar(*v); v++;
1903         }
1904       }
1905       bcol++;
1906     }
1907     *norm = 0.0;
1908     for (j=0; j<A->cmap->n; j++) {
1909       if (tmp[j] > *norm) *norm = tmp[j];
1910     }
1911     ierr = PetscFree(tmp);CHKERRQ(ierr);
1912   } else if (type == NORM_INFINITY) { /* maximum row sum */
1913     *norm = 0.0;
1914     for (k=0; k<bs; k++) {
1915       for (j=0; j<a->mbs; j++) {
1916         v = a->a + bs2*a->i[j] + k;
1917         sum = 0.0;
1918         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1919           for (k1=0; k1<bs; k1++){
1920             sum += PetscAbsScalar(*v);
1921             v   += bs;
1922           }
1923         }
1924         if (sum > *norm) *norm = sum;
1925       }
1926     }
1927   } else {
1928     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1929   }
1930   PetscFunctionReturn(0);
1931 }
1932 
1933 
1934 #undef __FUNCT__
1935 #define __FUNCT__ "MatEqual_SeqBAIJ"
1936 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1937 {
1938   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1939   PetscErrorCode ierr;
1940 
1941   PetscFunctionBegin;
1942   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1943   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1944     *flg = PETSC_FALSE;
1945     PetscFunctionReturn(0);
1946   }
1947 
1948   /* if the a->i are the same */
1949   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1950   if (!*flg) {
1951     PetscFunctionReturn(0);
1952   }
1953 
1954   /* if a->j are the same */
1955   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1956   if (!*flg) {
1957     PetscFunctionReturn(0);
1958   }
1959   /* if a->a are the same */
1960   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1961   PetscFunctionReturn(0);
1962 
1963 }
1964 
1965 #undef __FUNCT__
1966 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
1967 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1968 {
1969   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1970   PetscErrorCode ierr;
1971   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1972   PetscScalar    *x,zero = 0.0;
1973   MatScalar      *aa,*aa_j;
1974 
1975   PetscFunctionBegin;
1976   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1977   bs   = A->rmap->bs;
1978   aa   = a->a;
1979   ai   = a->i;
1980   aj   = a->j;
1981   ambs = a->mbs;
1982   bs2  = a->bs2;
1983 
1984   ierr = VecSet(v,zero);CHKERRQ(ierr);
1985   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1986   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1987   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1988   for (i=0; i<ambs; i++) {
1989     for (j=ai[i]; j<ai[i+1]; j++) {
1990       if (aj[j] == i) {
1991         row  = i*bs;
1992         aa_j = aa+j*bs2;
1993         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1994         break;
1995       }
1996     }
1997   }
1998   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1999   PetscFunctionReturn(0);
2000 }
2001 
2002 #undef __FUNCT__
2003 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2004 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2005 {
2006   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2007   PetscScalar    *l,*r,x,*li,*ri;
2008   MatScalar      *aa,*v;
2009   PetscErrorCode ierr;
2010   PetscInt       i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
2011 
2012   PetscFunctionBegin;
2013   ai  = a->i;
2014   aj  = a->j;
2015   aa  = a->a;
2016   m   = A->rmap->n;
2017   n   = A->cmap->n;
2018   bs  = A->rmap->bs;
2019   mbs = a->mbs;
2020   bs2 = a->bs2;
2021   if (ll) {
2022     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
2023     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2024     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2025     for (i=0; i<mbs; i++) { /* for each block row */
2026       M  = ai[i+1] - ai[i];
2027       li = l + i*bs;
2028       v  = aa + bs2*ai[i];
2029       for (j=0; j<M; j++) { /* for each block */
2030         for (k=0; k<bs2; k++) {
2031           (*v++) *= li[k%bs];
2032         }
2033       }
2034     }
2035     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
2036     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2037   }
2038 
2039   if (rr) {
2040     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
2041     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2042     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2043     for (i=0; i<mbs; i++) { /* for each block row */
2044       M  = ai[i+1] - ai[i];
2045       v  = aa + bs2*ai[i];
2046       for (j=0; j<M; j++) { /* for each block */
2047         ri = r + bs*aj[ai[i]+j];
2048         for (k=0; k<bs; k++) {
2049           x = ri[k];
2050           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
2051         }
2052       }
2053     }
2054     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
2055     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2056   }
2057   PetscFunctionReturn(0);
2058 }
2059 
2060 
2061 #undef __FUNCT__
2062 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2063 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2064 {
2065   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2066 
2067   PetscFunctionBegin;
2068   info->block_size     = a->bs2;
2069   info->nz_allocated   = a->maxnz;
2070   info->nz_used        = a->bs2*a->nz;
2071   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
2072   info->assemblies   = A->num_ass;
2073   info->mallocs      = a->reallocs;
2074   info->memory       = ((PetscObject)A)->mem;
2075   if (A->factor) {
2076     info->fill_ratio_given  = A->info.fill_ratio_given;
2077     info->fill_ratio_needed = A->info.fill_ratio_needed;
2078     info->factor_mallocs    = A->info.factor_mallocs;
2079   } else {
2080     info->fill_ratio_given  = 0;
2081     info->fill_ratio_needed = 0;
2082     info->factor_mallocs    = 0;
2083   }
2084   PetscFunctionReturn(0);
2085 }
2086 
2087 
2088 #undef __FUNCT__
2089 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2090 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2091 {
2092   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2093   PetscErrorCode ierr;
2094 
2095   PetscFunctionBegin;
2096   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2097   PetscFunctionReturn(0);
2098 }
2099