xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 2ae9f97c2b3342c7fa240ee3f9868a1cbf4cbe21)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 
7 #undef __FUNCT__
8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
12   PetscErrorCode ierr;
13   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
14   const PetscInt *idx;
15   PetscInt       start,end,*ai,*aj,bs,*nidx2;
16   PetscBT        table;
17 
18   PetscFunctionBegin;
19   m     = a->mbs;
20   ai    = a->i;
21   aj    = a->j;
22   bs    = A->rmap->bs;
23 
24   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25 
26   ierr = PetscBTCreate(m,table);CHKERRQ(ierr);
27   ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
28   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) {
31     /* Initialise the two local arrays */
32     isz  = 0;
33     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
37     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j=0; j<n ; ++j){
41       ival = idx[j]/bs; /* convert the indices into block indices */
42       if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
44     }
45     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
46     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
47 
48     k = 0;
49     for (j=0; j<ov; j++){ /* for each overlap*/
50       n = isz;
51       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row+1];
55         for (l = start; l<end ; l++){
56           val = aj[l];
57           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
58         }
59       }
60     }
61     /* expand the Index Set */
62     for (j=0; j<isz; j++) {
63       for (k=0; k<bs; k++)
64         nidx2[j*bs+k] = nidx[j]*bs+k;
65     }
66     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
67   }
68   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
69   ierr = PetscFree(nidx);CHKERRQ(ierr);
70   ierr = PetscFree(nidx2);CHKERRQ(ierr);
71   PetscFunctionReturn(0);
72 }
73 
74 #undef __FUNCT__
75 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
76 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77 {
78   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
79   PetscErrorCode ierr;
80   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
82   const PetscInt *irow,*icol;
83   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84   PetscInt       *aj = a->j,*ai = a->i;
85   MatScalar      *mat_a;
86   Mat            C;
87   PetscTruth     flag,sorted;
88 
89   PetscFunctionBegin;
90   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
91   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
92 
93   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
94   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
95   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
96   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
97 
98   ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
99   ssmap = smap;
100   ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
101   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
102   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
103   /* determine lens of each row */
104   for (i=0; i<nrows; i++) {
105     kstart  = ai[irow[i]];
106     kend    = kstart + a->ilen[irow[i]];
107     lens[i] = 0;
108       for (k=kstart; k<kend; k++) {
109         if (ssmap[aj[k]]) {
110           lens[i]++;
111         }
112       }
113     }
114   /* Create and fill new matrix */
115   if (scall == MAT_REUSE_MATRIX) {
116     c = (Mat_SeqBAIJ *)((*B)->data);
117 
118     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
119     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
120     if (!flag) {
121       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
122     }
123     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
124     C = *B;
125   } else {
126     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
127     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
128     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
129     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
130   }
131   c = (Mat_SeqBAIJ *)(C->data);
132   for (i=0; i<nrows; i++) {
133     row    = irow[i];
134     kstart = ai[row];
135     kend   = kstart + a->ilen[row];
136     mat_i  = c->i[i];
137     mat_j  = c->j + mat_i;
138     mat_a  = c->a + mat_i*bs2;
139     mat_ilen = c->ilen + i;
140     for (k=kstart; k<kend; k++) {
141       if ((tcol=ssmap[a->j[k]])) {
142         *mat_j++ = tcol - 1;
143         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
144         mat_a   += bs2;
145         (*mat_ilen)++;
146       }
147     }
148   }
149 
150   /* Free work space */
151   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
152   ierr = PetscFree(smap);CHKERRQ(ierr);
153   ierr = PetscFree(lens);CHKERRQ(ierr);
154   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
155   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
156 
157   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
158   *B = C;
159   PetscFunctionReturn(0);
160 }
161 
162 #undef __FUNCT__
163 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
164 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
165 {
166   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
167   IS             is1,is2;
168   PetscErrorCode ierr;
169   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
170   const PetscInt *irow,*icol;
171 
172   PetscFunctionBegin;
173   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
174   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
175   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
176   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
177 
178   /* Verify if the indices corespond to each element in a block
179    and form the IS with compressed IS */
180   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
181   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
182   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
183   count = 0;
184   for (i=0; i<a->mbs; i++) {
185     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
186     if (vary[i]==bs) iary[count++] = i;
187   }
188   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
189 
190   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
191   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
192   count = 0;
193   for (i=0; i<a->mbs; i++) {
194     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
195     if (vary[i]==bs) iary[count++] = i;
196   }
197   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);CHKERRQ(ierr);
198   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
199   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
200   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
201 
202   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
203   ISDestroy(is1);
204   ISDestroy(is2);
205   PetscFunctionReturn(0);
206 }
207 
208 #undef __FUNCT__
209 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
210 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
211 {
212   PetscErrorCode ierr;
213   PetscInt       i;
214 
215   PetscFunctionBegin;
216   if (scall == MAT_INITIAL_MATRIX) {
217     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
218   }
219 
220   for (i=0; i<n; i++) {
221     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
222   }
223   PetscFunctionReturn(0);
224 }
225 
226 
227 /* -------------------------------------------------------*/
228 /* Should check that shapes of vectors and matrices match */
229 /* -------------------------------------------------------*/
230 #include "petscblaslapack.h"
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatMult_SeqBAIJ_1"
234 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
235 {
236   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
237   PetscScalar       *z,sum;
238   const PetscScalar *x;
239   const MatScalar   *v;
240   PetscErrorCode    ierr;
241   PetscInt          mbs,i,n,nonzerorow=0;
242   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
243   PetscTruth        usecprow=a->compressedrow.use;
244 
245   PetscFunctionBegin;
246   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
247   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
248 
249   if (usecprow){
250     mbs  = a->compressedrow.nrows;
251     ii   = a->compressedrow.i;
252     ridx = a->compressedrow.rindex;
253     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
254   } else {
255     mbs = a->mbs;
256     ii  = a->i;
257   }
258 
259   for (i=0; i<mbs; i++) {
260     n    = ii[1] - ii[0];
261     v    = a->a + ii[0];
262     idx  = a->j + ii[0];
263     ii++;
264     sum  = 0.0;
265     PetscSparseDensePlusDot(sum,x,v,idx,n);
266     if (usecprow){
267       z[ridx[i]] = sum;
268     } else {
269       nonzerorow += (n>0);
270       z[i] = sum;
271     }
272   }
273   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
274   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
275   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
276   PetscFunctionReturn(0);
277 }
278 
279 #undef __FUNCT__
280 #define __FUNCT__ "MatMult_SeqBAIJ_2"
281 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
282 {
283   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
284   PetscScalar       *z = 0,sum1,sum2,*zarray;
285   const PetscScalar *x,*xb;
286   PetscScalar       x1,x2;
287   const MatScalar   *v;
288   PetscErrorCode    ierr;
289   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
290   PetscTruth        usecprow=a->compressedrow.use;
291 
292   PetscFunctionBegin;
293   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
294   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
295 
296   idx = a->j;
297   v   = a->a;
298   if (usecprow){
299     mbs  = a->compressedrow.nrows;
300     ii   = a->compressedrow.i;
301     ridx = a->compressedrow.rindex;
302   } else {
303     mbs = a->mbs;
304     ii  = a->i;
305     z   = zarray;
306   }
307 
308   for (i=0; i<mbs; i++) {
309     n  = ii[1] - ii[0]; ii++;
310     sum1 = 0.0; sum2 = 0.0;
311     nonzerorow += (n>0);
312     for (j=0; j<n; j++) {
313       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
314       sum1 += v[0]*x1 + v[2]*x2;
315       sum2 += v[1]*x1 + v[3]*x2;
316       v += 4;
317     }
318     if (usecprow) z = zarray + 2*ridx[i];
319     z[0] = sum1; z[1] = sum2;
320     if (!usecprow) z += 2;
321   }
322   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
323   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
324   ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr);
325   PetscFunctionReturn(0);
326 }
327 
328 #undef __FUNCT__
329 #define __FUNCT__ "MatMult_SeqBAIJ_3"
330 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
331 {
332   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
333   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
334   const PetscScalar *x,*xb;
335   const MatScalar   *v;
336   PetscErrorCode    ierr;
337   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
338   PetscTruth        usecprow=a->compressedrow.use;
339 
340 
341 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
342 #pragma disjoint(*v,*z,*xb)
343 #endif
344 
345   PetscFunctionBegin;
346   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
347   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
348 
349   idx = a->j;
350   v   = a->a;
351   if (usecprow){
352     mbs  = a->compressedrow.nrows;
353     ii   = a->compressedrow.i;
354     ridx = a->compressedrow.rindex;
355   } else {
356     mbs = a->mbs;
357     ii  = a->i;
358     z   = zarray;
359   }
360 
361   for (i=0; i<mbs; i++) {
362     n  = ii[1] - ii[0]; ii++;
363     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
364     nonzerorow += (n>0);
365     for (j=0; j<n; j++) {
366       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
367       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
368       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
369       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
370       v += 9;
371     }
372     if (usecprow) z = zarray + 3*ridx[i];
373     z[0] = sum1; z[1] = sum2; z[2] = sum3;
374     if (!usecprow) z += 3;
375   }
376   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
377   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
378   ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr);
379   PetscFunctionReturn(0);
380 }
381 
382 #undef __FUNCT__
383 #define __FUNCT__ "MatMult_SeqBAIJ_4"
384 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
385 {
386   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
387   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
388   const PetscScalar *x,*xb;
389   const MatScalar   *v;
390   PetscErrorCode    ierr;
391   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
392   PetscTruth        usecprow=a->compressedrow.use;
393 
394   PetscFunctionBegin;
395   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
396   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
397 
398   idx = a->j;
399   v   = a->a;
400   if (usecprow){
401     mbs  = a->compressedrow.nrows;
402     ii   = a->compressedrow.i;
403     ridx = a->compressedrow.rindex;
404   } else {
405     mbs = a->mbs;
406     ii  = a->i;
407     z   = zarray;
408   }
409 
410   for (i=0; i<mbs; i++) {
411     n  = ii[1] - ii[0]; ii++;
412     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
413     nonzerorow += (n>0);
414     for (j=0; j<n; j++) {
415       xb = x + 4*(*idx++);
416       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
417       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
418       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
419       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
420       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
421       v += 16;
422     }
423     if (usecprow) z = zarray + 4*ridx[i];
424     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
425     if (!usecprow) z += 4;
426   }
427   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
428   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
429   ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr);
430   PetscFunctionReturn(0);
431 }
432 
433 #undef __FUNCT__
434 #define __FUNCT__ "MatMult_SeqBAIJ_5"
435 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
436 {
437   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
438   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
439   const PetscScalar *xb,*x;
440   const MatScalar   *v;
441   PetscErrorCode    ierr;
442   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
443   PetscInt          mbs,i,j,n,nonzerorow=0;
444   PetscTruth        usecprow=a->compressedrow.use;
445 
446   PetscFunctionBegin;
447   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
448   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
449 
450   idx = a->j;
451   v   = a->a;
452   if (usecprow){
453     mbs  = a->compressedrow.nrows;
454     ii   = a->compressedrow.i;
455     ridx = a->compressedrow.rindex;
456   } else {
457     mbs = a->mbs;
458     ii  = a->i;
459     z   = zarray;
460   }
461 
462   for (i=0; i<mbs; i++) {
463     n  = ii[1] - ii[0]; ii++;
464     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
465     nonzerorow += (n>0);
466     for (j=0; j<n; j++) {
467       xb = x + 5*(*idx++);
468       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
469       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
470       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
471       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
472       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
473       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
474       v += 25;
475     }
476     if (usecprow) z = zarray + 5*ridx[i];
477     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
478     if (!usecprow) z += 5;
479   }
480   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
481   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
482   ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr);
483   PetscFunctionReturn(0);
484 }
485 
486 
487 #undef __FUNCT__
488 #define __FUNCT__ "MatMult_SeqBAIJ_6"
489 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
490 {
491   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
492   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
493   const PetscScalar *x,*xb;
494   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
495   const MatScalar   *v;
496   PetscErrorCode    ierr;
497   PetscInt          mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
498   PetscTruth        usecprow=a->compressedrow.use;
499 
500   PetscFunctionBegin;
501   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
502   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
503 
504   idx = a->j;
505   v   = a->a;
506   if (usecprow){
507     mbs  = a->compressedrow.nrows;
508     ii   = a->compressedrow.i;
509     ridx = a->compressedrow.rindex;
510   } else {
511     mbs = a->mbs;
512     ii  = a->i;
513     z   = zarray;
514   }
515 
516   for (i=0; i<mbs; i++) {
517     n  = ii[1] - ii[0]; ii++;
518     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
519     nonzerorow += (n>0);
520     for (j=0; j<n; j++) {
521       xb = x + 6*(*idx++);
522       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
523       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
524       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
525       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
526       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
527       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
528       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
529       v += 36;
530     }
531     if (usecprow) z = zarray + 6*ridx[i];
532     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
533     if (!usecprow) z += 6;
534   }
535 
536   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
537   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
538   ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr);
539   PetscFunctionReturn(0);
540 }
541 #undef __FUNCT__
542 #define __FUNCT__ "MatMult_SeqBAIJ_7"
543 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
544 {
545   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
546   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
547   const PetscScalar *x,*xb;
548   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
549   const MatScalar   *v;
550   PetscErrorCode    ierr;
551   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
552   PetscTruth        usecprow=a->compressedrow.use;
553 
554   PetscFunctionBegin;
555   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
556   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
557 
558   idx = a->j;
559   v   = a->a;
560   if (usecprow){
561     mbs    = a->compressedrow.nrows;
562     ii     = a->compressedrow.i;
563     ridx = a->compressedrow.rindex;
564   } else {
565     mbs = a->mbs;
566     ii  = a->i;
567     z   = zarray;
568   }
569 
570   for (i=0; i<mbs; i++) {
571     n  = ii[1] - ii[0]; ii++;
572     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
573     nonzerorow += (n>0);
574     for (j=0; j<n; j++) {
575       xb = x + 7*(*idx++);
576       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
577       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
578       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
579       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
580       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
581       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
582       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
583       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
584       v += 49;
585     }
586     if (usecprow) z = zarray + 7*ridx[i];
587     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
588     if (!usecprow) z += 7;
589   }
590 
591   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
592   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
593   ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr);
594   PetscFunctionReturn(0);
595 }
596 
597 /*
598     This will not work with MatScalar == float because it calls the BLAS
599 */
600 #undef __FUNCT__
601 #define __FUNCT__ "MatMult_SeqBAIJ_N"
602 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
603 {
604   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
605   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
606   MatScalar      *v;
607   PetscErrorCode ierr;
608   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
609   PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
610   PetscTruth     usecprow=a->compressedrow.use;
611 
612   PetscFunctionBegin;
613   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
614   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
615 
616   idx = a->j;
617   v   = a->a;
618   if (usecprow){
619     mbs  = a->compressedrow.nrows;
620     ii   = a->compressedrow.i;
621     ridx = a->compressedrow.rindex;
622   } else {
623     mbs = a->mbs;
624     ii  = a->i;
625     z   = zarray;
626   }
627 
628   if (!a->mult_work) {
629     k    = PetscMax(A->rmap->n,A->cmap->n);
630     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
631   }
632   work = a->mult_work;
633   for (i=0; i<mbs; i++) {
634     n     = ii[1] - ii[0]; ii++;
635     ncols = n*bs;
636     workt = work;
637     nonzerorow += (n>0);
638     for (j=0; j<n; j++) {
639       xb = x + bs*(*idx++);
640       for (k=0; k<bs; k++) workt[k] = xb[k];
641       workt += bs;
642     }
643     if (usecprow) z = zarray + bs*ridx[i];
644     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
645     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
646     v += n*bs2;
647     if (!usecprow) z += bs;
648   }
649   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
650   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
651   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr);
652   PetscFunctionReturn(0);
653 }
654 
655 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
656 #undef __FUNCT__
657 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
658 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
659 {
660   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
661   const PetscScalar  *x;
662   PetscScalar        *y,*z,sum;
663   const MatScalar    *v;
664   PetscErrorCode     ierr;
665   PetscInt           mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0;
666   const PetscInt     *idx,*ii;
667   PetscTruth         usecprow=a->compressedrow.use;
668 
669   PetscFunctionBegin;
670   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
671   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
672   if (zz != yy) {
673     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
674   } else {
675     z = y;
676   }
677 
678   idx = a->j;
679   v   = a->a;
680   if (usecprow){
681     if (zz != yy){
682       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
683     }
684     mbs  = a->compressedrow.nrows;
685     ii   = a->compressedrow.i;
686     ridx = a->compressedrow.rindex;
687   } else {
688     ii  = a->i;
689   }
690 
691   for (i=0; i<mbs; i++) {
692     n    = ii[1] - ii[0];
693     ii++;
694     if (!usecprow){
695       nonzerorow += (n>0);
696       sum = y[i];
697     } else {
698       sum = y[ridx[i]];
699     }
700     PetscSparseDensePlusDot(sum,x,v,idx,n);
701     v += n;
702     idx += n;
703     if (usecprow){
704       z[ridx[i]] = sum;
705     } else {
706       z[i] = sum;
707     }
708   }
709   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
710   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
711   if (zz != yy) {
712     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
713   }
714   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
715   PetscFunctionReturn(0);
716 }
717 
718 #undef __FUNCT__
719 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
720 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
721 {
722   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
723   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
724   PetscScalar    x1,x2,*yarray,*zarray;
725   MatScalar      *v;
726   PetscErrorCode ierr;
727   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
728   PetscTruth     usecprow=a->compressedrow.use;
729 
730   PetscFunctionBegin;
731   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
732   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
733   if (zz != yy) {
734     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
735   } else {
736     zarray = yarray;
737   }
738 
739   idx = a->j;
740   v   = a->a;
741   if (usecprow){
742     if (zz != yy){
743       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
744     }
745     mbs  = a->compressedrow.nrows;
746     ii   = a->compressedrow.i;
747     ridx = a->compressedrow.rindex;
748     if (zz != yy){
749       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
750     }
751   } else {
752     ii  = a->i;
753     y   = yarray;
754     z   = zarray;
755   }
756 
757   for (i=0; i<mbs; i++) {
758     n  = ii[1] - ii[0]; ii++;
759     if (usecprow){
760       z = zarray + 2*ridx[i];
761       y = yarray + 2*ridx[i];
762     }
763     sum1 = y[0]; sum2 = y[1];
764     for (j=0; j<n; j++) {
765       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
766       sum1 += v[0]*x1 + v[2]*x2;
767       sum2 += v[1]*x1 + v[3]*x2;
768       v += 4;
769     }
770     z[0] = sum1; z[1] = sum2;
771     if (!usecprow){
772       z += 2; y += 2;
773     }
774   }
775   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
776   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
777   if (zz != yy) {
778     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
779   }
780   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
781   PetscFunctionReturn(0);
782 }
783 
784 #undef __FUNCT__
785 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
786 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
787 {
788   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
789   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
790   MatScalar      *v;
791   PetscErrorCode ierr;
792   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
793   PetscTruth     usecprow=a->compressedrow.use;
794 
795   PetscFunctionBegin;
796   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
797   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
798   if (zz != yy) {
799     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
800   } else {
801     zarray = yarray;
802   }
803 
804   idx = a->j;
805   v   = a->a;
806   if (usecprow){
807     if (zz != yy){
808       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
809     }
810     mbs  = a->compressedrow.nrows;
811     ii   = a->compressedrow.i;
812     ridx = a->compressedrow.rindex;
813   } else {
814     ii  = a->i;
815     y   = yarray;
816     z   = zarray;
817   }
818 
819   for (i=0; i<mbs; i++) {
820     n  = ii[1] - ii[0]; ii++;
821     if (usecprow){
822       z = zarray + 3*ridx[i];
823       y = yarray + 3*ridx[i];
824     }
825     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
826     for (j=0; j<n; j++) {
827       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
828       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
829       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
830       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
831       v += 9;
832     }
833     z[0] = sum1; z[1] = sum2; z[2] = sum3;
834     if (!usecprow){
835       z += 3; y += 3;
836     }
837   }
838   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
839   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
840   if (zz != yy) {
841     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
842   }
843   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
844   PetscFunctionReturn(0);
845 }
846 
847 #undef __FUNCT__
848 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
849 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
850 {
851   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
852   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
853   MatScalar      *v;
854   PetscErrorCode ierr;
855   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
856   PetscTruth     usecprow=a->compressedrow.use;
857 
858   PetscFunctionBegin;
859   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
860   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
861   if (zz != yy) {
862     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
863   } else {
864     zarray = yarray;
865   }
866 
867   idx   = a->j;
868   v     = a->a;
869   if (usecprow){
870     if (zz != yy){
871       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
872     }
873     mbs  = a->compressedrow.nrows;
874     ii   = a->compressedrow.i;
875     ridx = a->compressedrow.rindex;
876   } else {
877     ii  = a->i;
878     y   = yarray;
879     z   = zarray;
880   }
881 
882   for (i=0; i<mbs; i++) {
883     n  = ii[1] - ii[0]; ii++;
884     if (usecprow){
885       z = zarray + 4*ridx[i];
886       y = yarray + 4*ridx[i];
887     }
888     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
889     for (j=0; j<n; j++) {
890       xb = x + 4*(*idx++);
891       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
892       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
893       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
894       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
895       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
896       v += 16;
897     }
898     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
899     if (!usecprow){
900       z += 4; y += 4;
901     }
902   }
903   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
904   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
905   if (zz != yy) {
906     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
907   }
908   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
909   PetscFunctionReturn(0);
910 }
911 
912 #undef __FUNCT__
913 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
914 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
915 {
916   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
917   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
918   PetscScalar    *yarray,*zarray;
919   MatScalar      *v;
920   PetscErrorCode ierr;
921   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
922   PetscTruth     usecprow=a->compressedrow.use;
923 
924   PetscFunctionBegin;
925   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
926   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
927   if (zz != yy) {
928     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
929   } else {
930     zarray = yarray;
931   }
932 
933   idx = a->j;
934   v   = a->a;
935   if (usecprow){
936     if (zz != yy){
937       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
938     }
939     mbs  = a->compressedrow.nrows;
940     ii   = a->compressedrow.i;
941     ridx = a->compressedrow.rindex;
942   } else {
943     ii  = a->i;
944     y   = yarray;
945     z   = zarray;
946   }
947 
948   for (i=0; i<mbs; i++) {
949     n  = ii[1] - ii[0]; ii++;
950     if (usecprow){
951       z = zarray + 5*ridx[i];
952       y = yarray + 5*ridx[i];
953     }
954     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
955     for (j=0; j<n; j++) {
956       xb = x + 5*(*idx++);
957       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
958       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
959       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
960       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
961       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
962       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
963       v += 25;
964     }
965     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
966     if (!usecprow){
967       z += 5; y += 5;
968     }
969   }
970   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
971   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
972   if (zz != yy) {
973     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
974   }
975   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
976   PetscFunctionReturn(0);
977 }
978 #undef __FUNCT__
979 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
980 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
981 {
982   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
983   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
984   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
985   MatScalar      *v;
986   PetscErrorCode ierr;
987   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
988   PetscTruth     usecprow=a->compressedrow.use;
989 
990   PetscFunctionBegin;
991   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
992   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
993   if (zz != yy) {
994     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
995   } else {
996     zarray = yarray;
997   }
998 
999   idx = a->j;
1000   v   = a->a;
1001   if (usecprow){
1002     if (zz != yy){
1003       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1004     }
1005     mbs  = a->compressedrow.nrows;
1006     ii   = a->compressedrow.i;
1007     ridx = a->compressedrow.rindex;
1008   } else {
1009     ii  = a->i;
1010     y   = yarray;
1011     z   = zarray;
1012   }
1013 
1014   for (i=0; i<mbs; i++) {
1015     n  = ii[1] - ii[0]; ii++;
1016     if (usecprow){
1017       z = zarray + 6*ridx[i];
1018       y = yarray + 6*ridx[i];
1019     }
1020     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1021     for (j=0; j<n; j++) {
1022       xb = x + 6*(*idx++);
1023       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1024       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1025       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1026       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1027       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1028       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1029       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1030       v += 36;
1031     }
1032     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1033     if (!usecprow){
1034       z += 6; y += 6;
1035     }
1036   }
1037   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1038   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1039   if (zz != yy) {
1040     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1041   }
1042   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1043   PetscFunctionReturn(0);
1044 }
1045 
1046 #undef __FUNCT__
1047 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
1048 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1049 {
1050   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1051   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1052   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1053   MatScalar      *v;
1054   PetscErrorCode ierr;
1055   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1056   PetscTruth     usecprow=a->compressedrow.use;
1057 
1058   PetscFunctionBegin;
1059   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1060   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1061   if (zz != yy) {
1062     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1063   } else {
1064     zarray = yarray;
1065   }
1066 
1067   idx = a->j;
1068   v   = a->a;
1069   if (usecprow){
1070     if (zz != yy){
1071       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1072     }
1073     mbs  = a->compressedrow.nrows;
1074     ii   = a->compressedrow.i;
1075     ridx = a->compressedrow.rindex;
1076   } else {
1077     ii  = a->i;
1078     y   = yarray;
1079     z   = zarray;
1080   }
1081 
1082   for (i=0; i<mbs; i++) {
1083     n  = ii[1] - ii[0]; ii++;
1084     if (usecprow){
1085       z = zarray + 7*ridx[i];
1086       y = yarray + 7*ridx[i];
1087     }
1088     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1089     for (j=0; j<n; j++) {
1090       xb = x + 7*(*idx++);
1091       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1092       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1093       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1094       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1095       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1096       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1097       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1098       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1099       v += 49;
1100     }
1101     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1102     if (!usecprow){
1103       z += 7; y += 7;
1104     }
1105   }
1106   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1107   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1108   if (zz != yy) {
1109     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1110   }
1111   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1112   PetscFunctionReturn(0);
1113 }
1114 
1115 #undef __FUNCT__
1116 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
1117 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1118 {
1119   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1120   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1121   MatScalar      *v;
1122   PetscErrorCode ierr;
1123   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1124   PetscInt       ncols,k,*ridx=PETSC_NULL;
1125   PetscTruth     usecprow=a->compressedrow.use;
1126 
1127   PetscFunctionBegin;
1128   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1129   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1130   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1131 
1132   idx = a->j;
1133   v   = a->a;
1134   if (usecprow){
1135     mbs    = a->compressedrow.nrows;
1136     ii     = a->compressedrow.i;
1137     ridx = a->compressedrow.rindex;
1138   } else {
1139     mbs = a->mbs;
1140     ii  = a->i;
1141     z   = zarray;
1142   }
1143 
1144   if (!a->mult_work) {
1145     k    = PetscMax(A->rmap->n,A->cmap->n);
1146     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1147   }
1148   work = a->mult_work;
1149   for (i=0; i<mbs; i++) {
1150     n     = ii[1] - ii[0]; ii++;
1151     ncols = n*bs;
1152     workt = work;
1153     for (j=0; j<n; j++) {
1154       xb = x + bs*(*idx++);
1155       for (k=0; k<bs; k++) workt[k] = xb[k];
1156       workt += bs;
1157     }
1158     if (usecprow) z = zarray + bs*ridx[i];
1159     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1160     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1161     v += n*bs2;
1162     if (!usecprow){
1163       z += bs;
1164     }
1165   }
1166   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1167   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1168   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1169   PetscFunctionReturn(0);
1170 }
1171 
1172 #undef __FUNCT__
1173 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
1174 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1175 {
1176   PetscScalar    zero = 0.0;
1177   PetscErrorCode ierr;
1178 
1179   PetscFunctionBegin;
1180   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1181   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1182   PetscFunctionReturn(0);
1183 }
1184 
1185 #undef __FUNCT__
1186 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
1187 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1188 {
1189   PetscScalar    zero = 0.0;
1190   PetscErrorCode ierr;
1191 
1192   PetscFunctionBegin;
1193   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1194   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1195   PetscFunctionReturn(0);
1196 }
1197 
1198 #undef __FUNCT__
1199 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
1200 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1201 
1202 {
1203   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1204   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1205   MatScalar         *v;
1206   PetscErrorCode    ierr;
1207   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1208   Mat_CompressedRow cprow = a->compressedrow;
1209   PetscTruth        usecprow=cprow.use;
1210 
1211   PetscFunctionBegin;
1212   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1213   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1214   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1215 
1216   idx = a->j;
1217   v   = a->a;
1218   if (usecprow){
1219     mbs  = cprow.nrows;
1220     ii   = cprow.i;
1221     ridx = cprow.rindex;
1222   } else {
1223     mbs=a->mbs;
1224     ii = a->i;
1225     xb = x;
1226   }
1227 
1228   switch (bs) {
1229   case 1:
1230     for (i=0; i<mbs; i++) {
1231       if (usecprow) xb = x + ridx[i];
1232       x1 = xb[0];
1233       ib = idx + ii[0];
1234       n  = ii[1] - ii[0]; ii++;
1235       for (j=0; j<n; j++) {
1236         rval    = ib[j];
1237         z[rval] += PetscConj(*v) * x1;
1238         v++;
1239       }
1240       if (!usecprow) xb++;
1241     }
1242     break;
1243   case 2:
1244     for (i=0; i<mbs; i++) {
1245       if (usecprow) xb = x + 2*ridx[i];
1246       x1 = xb[0]; x2 = xb[1];
1247       ib = idx + ii[0];
1248       n  = ii[1] - ii[0]; ii++;
1249       for (j=0; j<n; j++) {
1250         rval      = ib[j]*2;
1251         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1252         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1253         v  += 4;
1254       }
1255       if (!usecprow) xb += 2;
1256     }
1257     break;
1258   case 3:
1259     for (i=0; i<mbs; i++) {
1260       if (usecprow) xb = x + 3*ridx[i];
1261       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1262       ib = idx + ii[0];
1263       n  = ii[1] - ii[0]; ii++;
1264       for (j=0; j<n; j++) {
1265         rval      = ib[j]*3;
1266         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1267         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1268         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1269         v  += 9;
1270       }
1271       if (!usecprow) xb += 3;
1272     }
1273     break;
1274   case 4:
1275     for (i=0; i<mbs; i++) {
1276       if (usecprow) xb = x + 4*ridx[i];
1277       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1278       ib = idx + ii[0];
1279       n  = ii[1] - ii[0]; ii++;
1280       for (j=0; j<n; j++) {
1281         rval      = ib[j]*4;
1282         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1283         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1284         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1285         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1286         v  += 16;
1287       }
1288       if (!usecprow) xb += 4;
1289     }
1290     break;
1291   case 5:
1292     for (i=0; i<mbs; i++) {
1293       if (usecprow) xb = x + 5*ridx[i];
1294       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1295       x4 = xb[3]; x5 = xb[4];
1296       ib = idx + ii[0];
1297       n  = ii[1] - ii[0]; ii++;
1298       for (j=0; j<n; j++) {
1299         rval      = ib[j]*5;
1300         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1301         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1302         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1303         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1304         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1305         v  += 25;
1306       }
1307       if (!usecprow) xb += 5;
1308     }
1309     break;
1310   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
1311       PetscInt     ncols,k;
1312       PetscScalar  *work,*workt,*xtmp;
1313 
1314       SETERRQ(PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1315       if (!a->mult_work) {
1316         k = PetscMax(A->rmap->n,A->cmap->n);
1317         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1318       }
1319       work = a->mult_work;
1320       xtmp = x;
1321       for (i=0; i<mbs; i++) {
1322         n     = ii[1] - ii[0]; ii++;
1323         ncols = n*bs;
1324         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1325         if (usecprow) {
1326           xtmp = x + bs*ridx[i];
1327         }
1328         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1329         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1330         v += n*bs2;
1331         if (!usecprow) xtmp += bs;
1332         workt = work;
1333         for (j=0; j<n; j++) {
1334           zb = z + bs*(*idx++);
1335           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1336           workt += bs;
1337         }
1338       }
1339     }
1340   }
1341   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1342   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1343   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1344   PetscFunctionReturn(0);
1345 }
1346 
1347 #undef __FUNCT__
1348 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
1349 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1350 {
1351   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1352   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1353   MatScalar         *v;
1354   PetscErrorCode    ierr;
1355   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1356   Mat_CompressedRow cprow = a->compressedrow;
1357   PetscTruth        usecprow=cprow.use;
1358 
1359   PetscFunctionBegin;
1360   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1361   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1362   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1363 
1364   idx = a->j;
1365   v   = a->a;
1366   if (usecprow){
1367     mbs  = cprow.nrows;
1368     ii   = cprow.i;
1369     ridx = cprow.rindex;
1370   } else {
1371     mbs=a->mbs;
1372     ii = a->i;
1373     xb = x;
1374   }
1375 
1376   switch (bs) {
1377   case 1:
1378     for (i=0; i<mbs; i++) {
1379       if (usecprow) xb = x + ridx[i];
1380       x1 = xb[0];
1381       ib = idx + ii[0];
1382       n  = ii[1] - ii[0]; ii++;
1383       for (j=0; j<n; j++) {
1384         rval    = ib[j];
1385         z[rval] += *v * x1;
1386         v++;
1387       }
1388       if (!usecprow) xb++;
1389     }
1390     break;
1391   case 2:
1392     for (i=0; i<mbs; i++) {
1393       if (usecprow) xb = x + 2*ridx[i];
1394       x1 = xb[0]; x2 = xb[1];
1395       ib = idx + ii[0];
1396       n  = ii[1] - ii[0]; ii++;
1397       for (j=0; j<n; j++) {
1398         rval      = ib[j]*2;
1399         z[rval++] += v[0]*x1 + v[1]*x2;
1400         z[rval++] += v[2]*x1 + v[3]*x2;
1401         v  += 4;
1402       }
1403       if (!usecprow) xb += 2;
1404     }
1405     break;
1406   case 3:
1407     for (i=0; i<mbs; i++) {
1408       if (usecprow) xb = x + 3*ridx[i];
1409       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1410       ib = idx + ii[0];
1411       n  = ii[1] - ii[0]; ii++;
1412       for (j=0; j<n; j++) {
1413         rval      = ib[j]*3;
1414         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1415         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1416         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1417         v  += 9;
1418       }
1419       if (!usecprow) xb += 3;
1420     }
1421     break;
1422   case 4:
1423     for (i=0; i<mbs; i++) {
1424       if (usecprow) xb = x + 4*ridx[i];
1425       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1426       ib = idx + ii[0];
1427       n  = ii[1] - ii[0]; ii++;
1428       for (j=0; j<n; j++) {
1429         rval      = ib[j]*4;
1430         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1431         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1432         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1433         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1434         v  += 16;
1435       }
1436       if (!usecprow) xb += 4;
1437     }
1438     break;
1439   case 5:
1440     for (i=0; i<mbs; i++) {
1441       if (usecprow) xb = x + 5*ridx[i];
1442       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1443       x4 = xb[3]; x5 = xb[4];
1444       ib = idx + ii[0];
1445       n  = ii[1] - ii[0]; ii++;
1446       for (j=0; j<n; j++) {
1447         rval      = ib[j]*5;
1448         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1449         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1450         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1451         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1452         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1453         v  += 25;
1454       }
1455       if (!usecprow) xb += 5;
1456     }
1457     break;
1458   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1459       PetscInt     ncols,k;
1460       PetscScalar  *work,*workt,*xtmp;
1461 
1462       if (!a->mult_work) {
1463         k = PetscMax(A->rmap->n,A->cmap->n);
1464         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1465       }
1466       work = a->mult_work;
1467       xtmp = x;
1468       for (i=0; i<mbs; i++) {
1469         n     = ii[1] - ii[0]; ii++;
1470         ncols = n*bs;
1471         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1472         if (usecprow) {
1473           xtmp = x + bs*ridx[i];
1474         }
1475         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1476         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1477         v += n*bs2;
1478         if (!usecprow) xtmp += bs;
1479         workt = work;
1480         for (j=0; j<n; j++) {
1481           zb = z + bs*(*idx++);
1482           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1483           workt += bs;
1484         }
1485       }
1486     }
1487   }
1488   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1489   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1490   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1491   PetscFunctionReturn(0);
1492 }
1493 
1494 #undef __FUNCT__
1495 #define __FUNCT__ "MatScale_SeqBAIJ"
1496 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1497 {
1498   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)inA->data;
1499   PetscInt       totalnz = a->bs2*a->nz;
1500   PetscScalar    oalpha = alpha;
1501   PetscErrorCode ierr;
1502   PetscBLASInt   one = 1,tnz = PetscBLASIntCast(totalnz);
1503 
1504   PetscFunctionBegin;
1505   BLASscal_(&tnz,&oalpha,a->a,&one);
1506   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1507   PetscFunctionReturn(0);
1508 }
1509 
1510 #undef __FUNCT__
1511 #define __FUNCT__ "MatNorm_SeqBAIJ"
1512 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1513 {
1514   PetscErrorCode ierr;
1515   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1516   MatScalar      *v = a->a;
1517   PetscReal      sum = 0.0;
1518   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1519 
1520   PetscFunctionBegin;
1521   if (type == NORM_FROBENIUS) {
1522     for (i=0; i< bs2*nz; i++) {
1523 #if defined(PETSC_USE_COMPLEX)
1524       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1525 #else
1526       sum += (*v)*(*v); v++;
1527 #endif
1528     }
1529     *norm = sqrt(sum);
1530   } else if (type == NORM_1) { /* maximum column sum */
1531     PetscReal *tmp;
1532     PetscInt  *bcol = a->j;
1533     ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
1534     ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr);
1535     for (i=0; i<nz; i++){
1536       for (j=0; j<bs; j++){
1537         k1 = bs*(*bcol) + j; /* column index */
1538         for (k=0; k<bs; k++){
1539           tmp[k1] += PetscAbsScalar(*v); v++;
1540         }
1541       }
1542       bcol++;
1543     }
1544     *norm = 0.0;
1545     for (j=0; j<A->cmap->n; j++) {
1546       if (tmp[j] > *norm) *norm = tmp[j];
1547     }
1548     ierr = PetscFree(tmp);CHKERRQ(ierr);
1549   } else if (type == NORM_INFINITY) { /* maximum row sum */
1550     *norm = 0.0;
1551     for (k=0; k<bs; k++) {
1552       for (j=0; j<a->mbs; j++) {
1553         v = a->a + bs2*a->i[j] + k;
1554         sum = 0.0;
1555         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1556           for (k1=0; k1<bs; k1++){
1557             sum += PetscAbsScalar(*v);
1558             v   += bs;
1559           }
1560         }
1561         if (sum > *norm) *norm = sum;
1562       }
1563     }
1564   } else {
1565     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1566   }
1567   PetscFunctionReturn(0);
1568 }
1569 
1570 
1571 #undef __FUNCT__
1572 #define __FUNCT__ "MatEqual_SeqBAIJ"
1573 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1574 {
1575   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1576   PetscErrorCode ierr;
1577 
1578   PetscFunctionBegin;
1579   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1580   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1581     *flg = PETSC_FALSE;
1582     PetscFunctionReturn(0);
1583   }
1584 
1585   /* if the a->i are the same */
1586   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1587   if (!*flg) {
1588     PetscFunctionReturn(0);
1589   }
1590 
1591   /* if a->j are the same */
1592   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1593   if (!*flg) {
1594     PetscFunctionReturn(0);
1595   }
1596   /* if a->a are the same */
1597   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1598   PetscFunctionReturn(0);
1599 
1600 }
1601 
1602 #undef __FUNCT__
1603 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
1604 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1605 {
1606   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1607   PetscErrorCode ierr;
1608   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1609   PetscScalar    *x,zero = 0.0;
1610   MatScalar      *aa,*aa_j;
1611 
1612   PetscFunctionBegin;
1613   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1614   bs   = A->rmap->bs;
1615   aa   = a->a;
1616   ai   = a->i;
1617   aj   = a->j;
1618   ambs = a->mbs;
1619   bs2  = a->bs2;
1620 
1621   ierr = VecSet(v,zero);CHKERRQ(ierr);
1622   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1623   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1624   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1625   for (i=0; i<ambs; i++) {
1626     for (j=ai[i]; j<ai[i+1]; j++) {
1627       if (aj[j] == i) {
1628         row  = i*bs;
1629         aa_j = aa+j*bs2;
1630         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1631         break;
1632       }
1633     }
1634   }
1635   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1636   PetscFunctionReturn(0);
1637 }
1638 
1639 #undef __FUNCT__
1640 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
1641 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1642 {
1643   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1644   PetscScalar    *l,*r,x,*li,*ri;
1645   MatScalar      *aa,*v;
1646   PetscErrorCode ierr;
1647   PetscInt       i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1648 
1649   PetscFunctionBegin;
1650   ai  = a->i;
1651   aj  = a->j;
1652   aa  = a->a;
1653   m   = A->rmap->n;
1654   n   = A->cmap->n;
1655   bs  = A->rmap->bs;
1656   mbs = a->mbs;
1657   bs2 = a->bs2;
1658   if (ll) {
1659     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1660     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1661     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1662     for (i=0; i<mbs; i++) { /* for each block row */
1663       M  = ai[i+1] - ai[i];
1664       li = l + i*bs;
1665       v  = aa + bs2*ai[i];
1666       for (j=0; j<M; j++) { /* for each block */
1667         for (k=0; k<bs2; k++) {
1668           (*v++) *= li[k%bs];
1669         }
1670       }
1671     }
1672     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1673     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1674   }
1675 
1676   if (rr) {
1677     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1678     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1679     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1680     for (i=0; i<mbs; i++) { /* for each block row */
1681       M  = ai[i+1] - ai[i];
1682       v  = aa + bs2*ai[i];
1683       for (j=0; j<M; j++) { /* for each block */
1684         ri = r + bs*aj[ai[i]+j];
1685         for (k=0; k<bs; k++) {
1686           x = ri[k];
1687           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1688         }
1689       }
1690     }
1691     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1692     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1693   }
1694   PetscFunctionReturn(0);
1695 }
1696 
1697 
1698 #undef __FUNCT__
1699 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
1700 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1701 {
1702   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1703 
1704   PetscFunctionBegin;
1705   info->block_size     = a->bs2;
1706   info->nz_allocated   = a->maxnz;
1707   info->nz_used        = a->bs2*a->nz;
1708   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1709   info->assemblies   = A->num_ass;
1710   info->mallocs      = a->reallocs;
1711   info->memory       = ((PetscObject)A)->mem;
1712   if (A->factor) {
1713     info->fill_ratio_given  = A->info.fill_ratio_given;
1714     info->fill_ratio_needed = A->info.fill_ratio_needed;
1715     info->factor_mallocs    = A->info.factor_mallocs;
1716   } else {
1717     info->fill_ratio_given  = 0;
1718     info->fill_ratio_needed = 0;
1719     info->factor_mallocs    = 0;
1720   }
1721   PetscFunctionReturn(0);
1722 }
1723 
1724 
1725 #undef __FUNCT__
1726 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
1727 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
1728 {
1729   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1730   PetscErrorCode ierr;
1731 
1732   PetscFunctionBegin;
1733   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1734   PetscFunctionReturn(0);
1735 }
1736