xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 8d50da3b9e40b7fdaaffb6c7019ddf47ab2ac9d9)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc-private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #undef __FUNCT__
8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
12   PetscErrorCode ierr;
13   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
14   const PetscInt *idx;
15   PetscInt       start,end,*ai,*aj,bs,*nidx2;
16   PetscBT        table;
17 
18   PetscFunctionBegin;
19   m  = a->mbs;
20   ai = a->i;
21   aj = a->j;
22   bs = A->rmap->bs;
23 
24   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25 
26   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
27   ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
28   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) {
31     /* Initialise the two local arrays */
32     isz  = 0;
33     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
37     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j=0; j<n; ++j) {
41       ival = idx[j]/bs; /* convert the indices into block indices */
42       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
44     }
45     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
46     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
47 
48     k = 0;
49     for (j=0; j<ov; j++) { /* for each overlap*/
50       n = isz;
51       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row+1];
55         for (l = start; l<end; l++) {
56           val = aj[l];
57           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
58         }
59       }
60     }
61     /* expand the Index Set */
62     for (j=0; j<isz; j++) {
63       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
64     }
65     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
66   }
67   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
68   ierr = PetscFree(nidx);CHKERRQ(ierr);
69   ierr = PetscFree(nidx2);CHKERRQ(ierr);
70   PetscFunctionReturn(0);
71 }
72 
73 #undef __FUNCT__
74 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
75 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76 {
77   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
78   PetscErrorCode ierr;
79   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
81   const PetscInt *irow,*icol;
82   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83   PetscInt       *aj = a->j,*ai = a->i;
84   MatScalar      *mat_a;
85   Mat            C;
86   PetscBool      flag,sorted;
87 
88   PetscFunctionBegin;
89   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
90   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
91 
92   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
93   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
94   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
95   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
96 
97   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
98   ssmap = smap;
99   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
100   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
101   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
102   /* determine lens of each row */
103   for (i=0; i<nrows; i++) {
104     kstart  = ai[irow[i]];
105     kend    = kstart + a->ilen[irow[i]];
106     lens[i] = 0;
107     for (k=kstart; k<kend; k++) {
108       if (ssmap[aj[k]]) lens[i]++;
109     }
110   }
111   /* Create and fill new matrix */
112   if (scall == MAT_REUSE_MATRIX) {
113     c = (Mat_SeqBAIJ*)((*B)->data);
114 
115     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
116     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
117     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
118     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
119     C    = *B;
120   } else {
121     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
122     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
123     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
124     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
125   }
126   c = (Mat_SeqBAIJ*)(C->data);
127   for (i=0; i<nrows; i++) {
128     row      = irow[i];
129     kstart   = ai[row];
130     kend     = kstart + a->ilen[row];
131     mat_i    = c->i[i];
132     mat_j    = c->j + mat_i;
133     mat_a    = c->a + mat_i*bs2;
134     mat_ilen = c->ilen + i;
135     for (k=kstart; k<kend; k++) {
136       if ((tcol=ssmap[a->j[k]])) {
137         *mat_j++ = tcol - 1;
138         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
139         mat_a   += bs2;
140         (*mat_ilen)++;
141       }
142     }
143   }
144 
145   /* Free work space */
146   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
147   ierr = PetscFree(smap);CHKERRQ(ierr);
148   ierr = PetscFree(lens);CHKERRQ(ierr);
149   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
150   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
151 
152   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
153   *B   = C;
154   PetscFunctionReturn(0);
155 }
156 
157 #undef __FUNCT__
158 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
159 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
160 {
161   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
162   IS             is1,is2;
163   PetscErrorCode ierr;
164   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
165   const PetscInt *irow,*icol;
166 
167   PetscFunctionBegin;
168   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
169   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
170   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
171   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
172 
173   /* Verify if the indices corespond to each element in a block
174    and form the IS with compressed IS */
175   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
176   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
177   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
178   count = 0;
179   for (i=0; i<a->mbs; i++) {
180     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
181     if (vary[i]==bs) iary[count++] = i;
182   }
183   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
184 
185   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
186   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
187   count = 0;
188   for (i=0; i<a->mbs; i++) {
189     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
190     if (vary[i]==bs) iary[count++] = i;
191   }
192   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
193   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
194   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
195   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
196 
197   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
198   ierr = ISDestroy(&is1);CHKERRQ(ierr);
199   ierr = ISDestroy(&is2);CHKERRQ(ierr);
200   PetscFunctionReturn(0);
201 }
202 
203 #undef __FUNCT__
204 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
205 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
206 {
207   PetscErrorCode ierr;
208   PetscInt       i;
209 
210   PetscFunctionBegin;
211   if (scall == MAT_INITIAL_MATRIX) {
212     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
213   }
214 
215   for (i=0; i<n; i++) {
216     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
217   }
218   PetscFunctionReturn(0);
219 }
220 
221 
222 /* -------------------------------------------------------*/
223 /* Should check that shapes of vectors and matrices match */
224 /* -------------------------------------------------------*/
225 
226 #undef __FUNCT__
227 #define __FUNCT__ "MatMult_SeqBAIJ_1"
228 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
229 {
230   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
231   PetscScalar       *z,sum;
232   const PetscScalar *x;
233   const MatScalar   *v;
234   PetscErrorCode    ierr;
235   PetscInt          mbs,i,n;
236   const PetscInt    *idx,*ii,*ridx=NULL;
237   PetscBool         usecprow=a->compressedrow.use;
238 
239   PetscFunctionBegin;
240   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
241   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
242 
243   if (usecprow) {
244     mbs  = a->compressedrow.nrows;
245     ii   = a->compressedrow.i;
246     ridx = a->compressedrow.rindex;
247     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
248   } else {
249     mbs = a->mbs;
250     ii  = a->i;
251   }
252 
253   for (i=0; i<mbs; i++) {
254     n   = ii[1] - ii[0];
255     v   = a->a + ii[0];
256     idx = a->j + ii[0];
257     ii++;
258     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
259     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
260     sum = 0.0;
261     PetscSparseDensePlusDot(sum,x,v,idx,n);
262     if (usecprow) {
263       z[ridx[i]] = sum;
264     } else {
265       z[i]        = sum;
266     }
267   }
268   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
269   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
270   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
271   PetscFunctionReturn(0);
272 }
273 
274 #undef __FUNCT__
275 #define __FUNCT__ "MatMult_SeqBAIJ_2"
276 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
277 {
278   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
279   PetscScalar       *z = 0,sum1,sum2,*zarray;
280   const PetscScalar *x,*xb;
281   PetscScalar       x1,x2;
282   const MatScalar   *v;
283   PetscErrorCode    ierr;
284   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
285   PetscBool         usecprow=a->compressedrow.use;
286 
287   PetscFunctionBegin;
288   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
289   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
290 
291   idx = a->j;
292   v   = a->a;
293   if (usecprow) {
294     mbs  = a->compressedrow.nrows;
295     ii   = a->compressedrow.i;
296     ridx = a->compressedrow.rindex;
297   } else {
298     mbs = a->mbs;
299     ii  = a->i;
300     z   = zarray;
301   }
302 
303   for (i=0; i<mbs; i++) {
304     n           = ii[1] - ii[0]; ii++;
305     sum1        = 0.0; sum2 = 0.0;
306     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
307     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
308     for (j=0; j<n; j++) {
309       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
310       sum1 += v[0]*x1 + v[2]*x2;
311       sum2 += v[1]*x1 + v[3]*x2;
312       v    += 4;
313     }
314     if (usecprow) z = zarray + 2*ridx[i];
315     z[0] = sum1; z[1] = sum2;
316     if (!usecprow) z += 2;
317   }
318   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
319   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
320   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
321   PetscFunctionReturn(0);
322 }
323 
324 #undef __FUNCT__
325 #define __FUNCT__ "MatMult_SeqBAIJ_3"
326 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
327 {
328   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
329   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
330   const PetscScalar *x,*xb;
331   const MatScalar   *v;
332   PetscErrorCode    ierr;
333   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
334   PetscBool         usecprow=a->compressedrow.use;
335 
336 
337 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
338 #pragma disjoint(*v,*z,*xb)
339 #endif
340 
341   PetscFunctionBegin;
342   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
343   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
344 
345   idx = a->j;
346   v   = a->a;
347   if (usecprow) {
348     mbs  = a->compressedrow.nrows;
349     ii   = a->compressedrow.i;
350     ridx = a->compressedrow.rindex;
351   } else {
352     mbs = a->mbs;
353     ii  = a->i;
354     z   = zarray;
355   }
356 
357   for (i=0; i<mbs; i++) {
358     n           = ii[1] - ii[0]; ii++;
359     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
360     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
361     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
362     for (j=0; j<n; j++) {
363       xb = x + 3*(*idx++);
364       x1 = xb[0];
365       x2 = xb[1];
366       x3 = xb[2];
367 
368       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
369       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
370       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
371       v    += 9;
372     }
373     if (usecprow) z = zarray + 3*ridx[i];
374     z[0] = sum1; z[1] = sum2; z[2] = sum3;
375     if (!usecprow) z += 3;
376   }
377   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
378   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
379   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
380   PetscFunctionReturn(0);
381 }
382 
383 #undef __FUNCT__
384 #define __FUNCT__ "MatMult_SeqBAIJ_4"
385 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
386 {
387   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
388   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
389   const PetscScalar *x,*xb;
390   const MatScalar   *v;
391   PetscErrorCode    ierr;
392   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
393   PetscBool         usecprow=a->compressedrow.use;
394 
395   PetscFunctionBegin;
396   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
397   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
398 
399   idx = a->j;
400   v   = a->a;
401   if (usecprow) {
402     mbs  = a->compressedrow.nrows;
403     ii   = a->compressedrow.i;
404     ridx = a->compressedrow.rindex;
405   } else {
406     mbs = a->mbs;
407     ii  = a->i;
408     z   = zarray;
409   }
410 
411   for (i=0; i<mbs; i++) {
412     n = ii[1] - ii[0];
413     ii++;
414     sum1 = 0.0;
415     sum2 = 0.0;
416     sum3 = 0.0;
417     sum4 = 0.0;
418 
419     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
420     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
421     for (j=0; j<n; j++) {
422       xb    = x + 4*(*idx++);
423       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
424       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
425       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
426       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
427       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
428       v    += 16;
429     }
430     if (usecprow) z = zarray + 4*ridx[i];
431     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
432     if (!usecprow) z += 4;
433   }
434   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
435   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
436   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
437   PetscFunctionReturn(0);
438 }
439 
440 #undef __FUNCT__
441 #define __FUNCT__ "MatMult_SeqBAIJ_5"
442 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
443 {
444   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
445   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
446   const PetscScalar *xb,*x;
447   const MatScalar   *v;
448   PetscErrorCode    ierr;
449   const PetscInt    *idx,*ii,*ridx=NULL;
450   PetscInt          mbs,i,j,n;
451   PetscBool         usecprow=a->compressedrow.use;
452 
453   PetscFunctionBegin;
454   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
455   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
456 
457   idx = a->j;
458   v   = a->a;
459   if (usecprow) {
460     mbs  = a->compressedrow.nrows;
461     ii   = a->compressedrow.i;
462     ridx = a->compressedrow.rindex;
463   } else {
464     mbs = a->mbs;
465     ii  = a->i;
466     z   = zarray;
467   }
468 
469   for (i=0; i<mbs; i++) {
470     n           = ii[1] - ii[0]; ii++;
471     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
472     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
473     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
474     for (j=0; j<n; j++) {
475       xb    = x + 5*(*idx++);
476       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
477       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
478       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
479       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
480       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
481       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
482       v    += 25;
483     }
484     if (usecprow) z = zarray + 5*ridx[i];
485     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
486     if (!usecprow) z += 5;
487   }
488   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
489   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
490   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
491   PetscFunctionReturn(0);
492 }
493 
494 
495 #undef __FUNCT__
496 #define __FUNCT__ "MatMult_SeqBAIJ_6"
497 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
498 {
499   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
500   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
501   const PetscScalar *x,*xb;
502   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
503   const MatScalar   *v;
504   PetscErrorCode    ierr;
505   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
506   PetscBool         usecprow=a->compressedrow.use;
507 
508   PetscFunctionBegin;
509   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
510   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
511 
512   idx = a->j;
513   v   = a->a;
514   if (usecprow) {
515     mbs  = a->compressedrow.nrows;
516     ii   = a->compressedrow.i;
517     ridx = a->compressedrow.rindex;
518   } else {
519     mbs = a->mbs;
520     ii  = a->i;
521     z   = zarray;
522   }
523 
524   for (i=0; i<mbs; i++) {
525     n  = ii[1] - ii[0];
526     ii++;
527     sum1 = 0.0;
528     sum2 = 0.0;
529     sum3 = 0.0;
530     sum4 = 0.0;
531     sum5 = 0.0;
532     sum6 = 0.0;
533 
534     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
535     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
536     for (j=0; j<n; j++) {
537       xb    = x + 6*(*idx++);
538       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
539       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
540       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
541       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
542       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
543       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
544       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
545       v    += 36;
546     }
547     if (usecprow) z = zarray + 6*ridx[i];
548     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
549     if (!usecprow) z += 6;
550   }
551 
552   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
553   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
554   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
555   PetscFunctionReturn(0);
556 }
557 
558 #undef __FUNCT__
559 #define __FUNCT__ "MatMult_SeqBAIJ_7"
560 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
561 {
562   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
563   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
564   const PetscScalar *x,*xb;
565   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
566   const MatScalar   *v;
567   PetscErrorCode    ierr;
568   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
569   PetscBool         usecprow=a->compressedrow.use;
570 
571   PetscFunctionBegin;
572   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
573   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
574 
575   idx = a->j;
576   v   = a->a;
577   if (usecprow) {
578     mbs  = a->compressedrow.nrows;
579     ii   = a->compressedrow.i;
580     ridx = a->compressedrow.rindex;
581   } else {
582     mbs = a->mbs;
583     ii  = a->i;
584     z   = zarray;
585   }
586 
587   for (i=0; i<mbs; i++) {
588     n  = ii[1] - ii[0];
589     ii++;
590     sum1 = 0.0;
591     sum2 = 0.0;
592     sum3 = 0.0;
593     sum4 = 0.0;
594     sum5 = 0.0;
595     sum6 = 0.0;
596     sum7 = 0.0;
597 
598     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
599     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
600     for (j=0; j<n; j++) {
601       xb    = x + 7*(*idx++);
602       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
603       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
604       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
605       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
606       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
607       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
608       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
609       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
610       v    += 49;
611     }
612     if (usecprow) z = zarray + 7*ridx[i];
613     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
614     if (!usecprow) z += 7;
615   }
616 
617   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
618   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
619   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
620   PetscFunctionReturn(0);
621 }
622 
623 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
624 /* Default MatMult for block size 15 */
625 
626 #undef __FUNCT__
627 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
628 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
629 {
630   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
631   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
632   const PetscScalar *x,*xb;
633   PetscScalar       *zarray,xv;
634   const MatScalar   *v;
635   PetscErrorCode    ierr;
636   const PetscInt    *ii,*ij=a->j,*idx;
637   PetscInt          mbs,i,j,k,n,*ridx=NULL;
638   PetscBool         usecprow=a->compressedrow.use;
639 
640   PetscFunctionBegin;
641   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
642   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
643 
644   v = a->a;
645   if (usecprow) {
646     mbs  = a->compressedrow.nrows;
647     ii   = a->compressedrow.i;
648     ridx = a->compressedrow.rindex;
649   } else {
650     mbs = a->mbs;
651     ii  = a->i;
652     z   = zarray;
653   }
654 
655   for (i=0; i<mbs; i++) {
656     n    = ii[i+1] - ii[i];
657     idx  = ij + ii[i];
658     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
659     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
660 
661     for (j=0; j<n; j++) {
662       xb = x + 15*(idx[j]);
663 
664       for (k=0; k<15; k++) {
665         xv     =  xb[k];
666         sum1  += v[0]*xv;
667         sum2  += v[1]*xv;
668         sum3  += v[2]*xv;
669         sum4  += v[3]*xv;
670         sum5  += v[4]*xv;
671         sum6  += v[5]*xv;
672         sum7  += v[6]*xv;
673         sum8  += v[7]*xv;
674         sum9  += v[8]*xv;
675         sum10 += v[9]*xv;
676         sum11 += v[10]*xv;
677         sum12 += v[11]*xv;
678         sum13 += v[12]*xv;
679         sum14 += v[13]*xv;
680         sum15 += v[14]*xv;
681         v     += 15;
682       }
683     }
684     if (usecprow) z = zarray + 15*ridx[i];
685     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
686     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
687 
688     if (!usecprow) z += 15;
689   }
690 
691   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
692   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
693   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
694   PetscFunctionReturn(0);
695 }
696 
697 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
698 #undef __FUNCT__
699 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
700 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
701 {
702   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
703   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
704   const PetscScalar *x,*xb;
705   PetscScalar       x1,x2,x3,x4,*zarray;
706   const MatScalar   *v;
707   PetscErrorCode    ierr;
708   const PetscInt    *ii,*ij=a->j,*idx;
709   PetscInt          mbs,i,j,n,*ridx=NULL;
710   PetscBool         usecprow=a->compressedrow.use;
711 
712   PetscFunctionBegin;
713   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
714   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
715 
716   v = a->a;
717   if (usecprow) {
718     mbs  = a->compressedrow.nrows;
719     ii   = a->compressedrow.i;
720     ridx = a->compressedrow.rindex;
721   } else {
722     mbs = a->mbs;
723     ii  = a->i;
724     z   = zarray;
725   }
726 
727   for (i=0; i<mbs; i++) {
728     n    = ii[i+1] - ii[i];
729     idx  = ij + ii[i];
730     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
731     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
732 
733     for (j=0; j<n; j++) {
734       xb = x + 15*(idx[j]);
735       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
736 
737       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
738       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
739       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
740       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
741       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
742       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
743       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
744       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
745       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
746       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
747       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
748       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
749       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
750       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
751       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
752 
753       v += 60;
754 
755       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
756 
757       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
758       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
759       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
760       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
761       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
762       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
763       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
764       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
765       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
766       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
767       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
768       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
769       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
770       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
771       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
772       v     += 60;
773 
774       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
775       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
776       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
777       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
778       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
779       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
780       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
781       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
782       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
783       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
784       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
785       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
786       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
787       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
788       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
789       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
790       v     += 60;
791 
792       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
793       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
794       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
795       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
796       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
797       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
798       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
799       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
800       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
801       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
802       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
803       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
804       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
805       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
806       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
807       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
808       v     += 45;
809     }
810     if (usecprow) z = zarray + 15*ridx[i];
811     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
812     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
813 
814     if (!usecprow) z += 15;
815   }
816 
817   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
818   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
819   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
820   PetscFunctionReturn(0);
821 }
822 
823 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
824 #undef __FUNCT__
825 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
826 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
827 {
828   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
829   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
830   const PetscScalar *x,*xb;
831   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
832   const MatScalar   *v;
833   PetscErrorCode    ierr;
834   const PetscInt    *ii,*ij=a->j,*idx;
835   PetscInt          mbs,i,j,n,*ridx=NULL;
836   PetscBool         usecprow=a->compressedrow.use;
837 
838   PetscFunctionBegin;
839   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
840   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
841 
842   v = a->a;
843   if (usecprow) {
844     mbs  = a->compressedrow.nrows;
845     ii   = a->compressedrow.i;
846     ridx = a->compressedrow.rindex;
847   } else {
848     mbs = a->mbs;
849     ii  = a->i;
850     z   = zarray;
851   }
852 
853   for (i=0; i<mbs; i++) {
854     n    = ii[i+1] - ii[i];
855     idx  = ij + ii[i];
856     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
857     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
858 
859     for (j=0; j<n; j++) {
860       xb = x + 15*(idx[j]);
861       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
862       x8 = xb[7];
863 
864       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
865       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
866       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
867       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
868       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
869       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
870       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
871       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
872       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
873       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
874       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
875       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
876       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
877       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
878       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
879       v     += 120;
880 
881       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
882 
883       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
884       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
885       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
886       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
887       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
888       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
889       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
890       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
891       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
892       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
893       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
894       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
895       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
896       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
897       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
898       v     += 105;
899     }
900     if (usecprow) z = zarray + 15*ridx[i];
901     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
902     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
903 
904     if (!usecprow) z += 15;
905   }
906 
907   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
908   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
909   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
910   PetscFunctionReturn(0);
911 }
912 
913 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
914 
915 #undef __FUNCT__
916 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
917 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
918 {
919   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
920   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
921   const PetscScalar *x,*xb;
922   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
923   const MatScalar   *v;
924   PetscErrorCode    ierr;
925   const PetscInt    *ii,*ij=a->j,*idx;
926   PetscInt          mbs,i,j,n,*ridx=NULL;
927   PetscBool         usecprow=a->compressedrow.use;
928 
929   PetscFunctionBegin;
930   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
931   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
932 
933   v = a->a;
934   if (usecprow) {
935     mbs  = a->compressedrow.nrows;
936     ii   = a->compressedrow.i;
937     ridx = a->compressedrow.rindex;
938   } else {
939     mbs = a->mbs;
940     ii  = a->i;
941     z   = zarray;
942   }
943 
944   for (i=0; i<mbs; i++) {
945     n    = ii[i+1] - ii[i];
946     idx  = ij + ii[i];
947     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
948     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
949 
950     for (j=0; j<n; j++) {
951       xb = x + 15*(idx[j]);
952       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
953       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
954 
955       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
956       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
957       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
958       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
959       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
960       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
961       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
962       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
963       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
964       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
965       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
966       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
967       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
968       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
969       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
970       v     += 225;
971     }
972     if (usecprow) z = zarray + 15*ridx[i];
973     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
974     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
975 
976     if (!usecprow) z += 15;
977   }
978 
979   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
980   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
981   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
982   PetscFunctionReturn(0);
983 }
984 
985 
986 /*
987     This will not work with MatScalar == float because it calls the BLAS
988 */
989 #undef __FUNCT__
990 #define __FUNCT__ "MatMult_SeqBAIJ_N"
991 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
992 {
993   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
994   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
995   MatScalar      *v;
996   PetscErrorCode ierr;
997   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
998   PetscInt       ncols,k,*ridx=NULL;
999   PetscBool      usecprow=a->compressedrow.use;
1000 
1001   PetscFunctionBegin;
1002   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1003   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1004 
1005   idx = a->j;
1006   v   = a->a;
1007   if (usecprow) {
1008     mbs  = a->compressedrow.nrows;
1009     ii   = a->compressedrow.i;
1010     ridx = a->compressedrow.rindex;
1011   } else {
1012     mbs = a->mbs;
1013     ii  = a->i;
1014     z   = zarray;
1015   }
1016 
1017   if (!a->mult_work) {
1018     k    = PetscMax(A->rmap->n,A->cmap->n);
1019     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1020   }
1021   work = a->mult_work;
1022   for (i=0; i<mbs; i++) {
1023     n           = ii[1] - ii[0]; ii++;
1024     ncols       = n*bs;
1025     workt       = work;
1026     for (j=0; j<n; j++) {
1027       xb = x + bs*(*idx++);
1028       for (k=0; k<bs; k++) workt[k] = xb[k];
1029       workt += bs;
1030     }
1031     if (usecprow) z = zarray + bs*ridx[i];
1032     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1033     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1034     v += n*bs2;
1035     if (!usecprow) z += bs;
1036   }
1037   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1038   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1039   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1040   PetscFunctionReturn(0);
1041 }
1042 
1043 #undef __FUNCT__
1044 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1045 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1046 {
1047   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1048   const PetscScalar *x;
1049   PetscScalar       *y,*z,sum;
1050   const MatScalar   *v;
1051   PetscErrorCode    ierr;
1052   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1053   const PetscInt    *idx,*ii;
1054   PetscBool         usecprow=a->compressedrow.use;
1055 
1056   PetscFunctionBegin;
1057   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1058   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1059   if (zz != yy) {
1060     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1061   } else {
1062     z = y;
1063   }
1064 
1065   idx = a->j;
1066   v   = a->a;
1067   if (usecprow) {
1068     if (zz != yy) {
1069       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1070     }
1071     mbs  = a->compressedrow.nrows;
1072     ii   = a->compressedrow.i;
1073     ridx = a->compressedrow.rindex;
1074   } else {
1075     ii = a->i;
1076   }
1077 
1078   for (i=0; i<mbs; i++) {
1079     n = ii[1] - ii[0];
1080     ii++;
1081     if (!usecprow) {
1082       sum         = y[i];
1083     } else {
1084       sum = y[ridx[i]];
1085     }
1086     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1087     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1088     PetscSparseDensePlusDot(sum,x,v,idx,n);
1089     v   += n;
1090     idx += n;
1091     if (usecprow) {
1092       z[ridx[i]] = sum;
1093     } else {
1094       z[i] = sum;
1095     }
1096   }
1097   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1098   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1099   if (zz != yy) {
1100     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1101   }
1102   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1103   PetscFunctionReturn(0);
1104 }
1105 
1106 #undef __FUNCT__
1107 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1108 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1109 {
1110   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1111   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
1112   PetscScalar    x1,x2,*yarray,*zarray;
1113   MatScalar      *v;
1114   PetscErrorCode ierr;
1115   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1116   PetscBool      usecprow=a->compressedrow.use;
1117 
1118   PetscFunctionBegin;
1119   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1120   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1121   if (zz != yy) {
1122     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1123   } else {
1124     zarray = yarray;
1125   }
1126 
1127   idx = a->j;
1128   v   = a->a;
1129   if (usecprow) {
1130     if (zz != yy) {
1131       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1132     }
1133     mbs  = a->compressedrow.nrows;
1134     ii   = a->compressedrow.i;
1135     ridx = a->compressedrow.rindex;
1136     if (zz != yy) {
1137       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1138     }
1139   } else {
1140     ii = a->i;
1141     y  = yarray;
1142     z  = zarray;
1143   }
1144 
1145   for (i=0; i<mbs; i++) {
1146     n = ii[1] - ii[0]; ii++;
1147     if (usecprow) {
1148       z = zarray + 2*ridx[i];
1149       y = yarray + 2*ridx[i];
1150     }
1151     sum1 = y[0]; sum2 = y[1];
1152     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1153     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1154     for (j=0; j<n; j++) {
1155       xb = x + 2*(*idx++);
1156       x1 = xb[0];
1157       x2 = xb[1];
1158 
1159       sum1 += v[0]*x1 + v[2]*x2;
1160       sum2 += v[1]*x1 + v[3]*x2;
1161       v    += 4;
1162     }
1163     z[0] = sum1; z[1] = sum2;
1164     if (!usecprow) {
1165       z += 2; y += 2;
1166     }
1167   }
1168   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1169   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1170   if (zz != yy) {
1171     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1172   }
1173   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1174   PetscFunctionReturn(0);
1175 }
1176 
1177 #undef __FUNCT__
1178 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1179 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1180 {
1181   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1182   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1183   MatScalar      *v;
1184   PetscErrorCode ierr;
1185   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1186   PetscBool      usecprow=a->compressedrow.use;
1187 
1188   PetscFunctionBegin;
1189   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1190   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1191   if (zz != yy) {
1192     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1193   } else {
1194     zarray = yarray;
1195   }
1196 
1197   idx = a->j;
1198   v   = a->a;
1199   if (usecprow) {
1200     if (zz != yy) {
1201       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1202     }
1203     mbs  = a->compressedrow.nrows;
1204     ii   = a->compressedrow.i;
1205     ridx = a->compressedrow.rindex;
1206   } else {
1207     ii = a->i;
1208     y  = yarray;
1209     z  = zarray;
1210   }
1211 
1212   for (i=0; i<mbs; i++) {
1213     n = ii[1] - ii[0]; ii++;
1214     if (usecprow) {
1215       z = zarray + 3*ridx[i];
1216       y = yarray + 3*ridx[i];
1217     }
1218     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1219     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1220     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1221     for (j=0; j<n; j++) {
1222       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1223       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1224       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1225       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1226       v    += 9;
1227     }
1228     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1229     if (!usecprow) {
1230       z += 3; y += 3;
1231     }
1232   }
1233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1234   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1235   if (zz != yy) {
1236     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1237   }
1238   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1239   PetscFunctionReturn(0);
1240 }
1241 
1242 #undef __FUNCT__
1243 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1244 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1245 {
1246   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1247   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1248   MatScalar      *v;
1249   PetscErrorCode ierr;
1250   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1251   PetscBool      usecprow=a->compressedrow.use;
1252 
1253   PetscFunctionBegin;
1254   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1255   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1256   if (zz != yy) {
1257     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1258   } else {
1259     zarray = yarray;
1260   }
1261 
1262   idx = a->j;
1263   v   = a->a;
1264   if (usecprow) {
1265     if (zz != yy) {
1266       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1267     }
1268     mbs  = a->compressedrow.nrows;
1269     ii   = a->compressedrow.i;
1270     ridx = a->compressedrow.rindex;
1271   } else {
1272     ii = a->i;
1273     y  = yarray;
1274     z  = zarray;
1275   }
1276 
1277   for (i=0; i<mbs; i++) {
1278     n = ii[1] - ii[0]; ii++;
1279     if (usecprow) {
1280       z = zarray + 4*ridx[i];
1281       y = yarray + 4*ridx[i];
1282     }
1283     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1284     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1285     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1286     for (j=0; j<n; j++) {
1287       xb    = x + 4*(*idx++);
1288       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1289       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1290       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1291       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1292       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1293       v    += 16;
1294     }
1295     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1296     if (!usecprow) {
1297       z += 4; y += 4;
1298     }
1299   }
1300   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1301   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1302   if (zz != yy) {
1303     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1304   }
1305   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1306   PetscFunctionReturn(0);
1307 }
1308 
1309 #undef __FUNCT__
1310 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1311 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1312 {
1313   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1314   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1315   PetscScalar    *yarray,*zarray;
1316   MatScalar      *v;
1317   PetscErrorCode ierr;
1318   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1319   PetscBool      usecprow=a->compressedrow.use;
1320 
1321   PetscFunctionBegin;
1322   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1323   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1324   if (zz != yy) {
1325     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1326   } else {
1327     zarray = yarray;
1328   }
1329 
1330   idx = a->j;
1331   v   = a->a;
1332   if (usecprow) {
1333     if (zz != yy) {
1334       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1335     }
1336     mbs  = a->compressedrow.nrows;
1337     ii   = a->compressedrow.i;
1338     ridx = a->compressedrow.rindex;
1339   } else {
1340     ii = a->i;
1341     y  = yarray;
1342     z  = zarray;
1343   }
1344 
1345   for (i=0; i<mbs; i++) {
1346     n = ii[1] - ii[0]; ii++;
1347     if (usecprow) {
1348       z = zarray + 5*ridx[i];
1349       y = yarray + 5*ridx[i];
1350     }
1351     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1352     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1353     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1354     for (j=0; j<n; j++) {
1355       xb    = x + 5*(*idx++);
1356       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1357       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1358       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1359       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1360       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1361       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1362       v    += 25;
1363     }
1364     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1365     if (!usecprow) {
1366       z += 5; y += 5;
1367     }
1368   }
1369   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1370   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1371   if (zz != yy) {
1372     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1373   }
1374   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1375   PetscFunctionReturn(0);
1376 }
1377 #undef __FUNCT__
1378 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1379 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1380 {
1381   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1382   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1383   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1384   MatScalar      *v;
1385   PetscErrorCode ierr;
1386   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1387   PetscBool      usecprow=a->compressedrow.use;
1388 
1389   PetscFunctionBegin;
1390   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1391   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1392   if (zz != yy) {
1393     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1394   } else {
1395     zarray = yarray;
1396   }
1397 
1398   idx = a->j;
1399   v   = a->a;
1400   if (usecprow) {
1401     if (zz != yy) {
1402       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1403     }
1404     mbs  = a->compressedrow.nrows;
1405     ii   = a->compressedrow.i;
1406     ridx = a->compressedrow.rindex;
1407   } else {
1408     ii = a->i;
1409     y  = yarray;
1410     z  = zarray;
1411   }
1412 
1413   for (i=0; i<mbs; i++) {
1414     n = ii[1] - ii[0]; ii++;
1415     if (usecprow) {
1416       z = zarray + 6*ridx[i];
1417       y = yarray + 6*ridx[i];
1418     }
1419     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1420     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1421     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1422     for (j=0; j<n; j++) {
1423       xb    = x + 6*(*idx++);
1424       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1425       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1426       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1427       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1428       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1429       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1430       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1431       v    += 36;
1432     }
1433     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1434     if (!usecprow) {
1435       z += 6; y += 6;
1436     }
1437   }
1438   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1439   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1440   if (zz != yy) {
1441     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1442   }
1443   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1444   PetscFunctionReturn(0);
1445 }
1446 
1447 #undef __FUNCT__
1448 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
1449 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1450 {
1451   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1452   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1453   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1454   MatScalar      *v;
1455   PetscErrorCode ierr;
1456   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1457   PetscBool      usecprow=a->compressedrow.use;
1458 
1459   PetscFunctionBegin;
1460   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1461   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1462   if (zz != yy) {
1463     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1464   } else {
1465     zarray = yarray;
1466   }
1467 
1468   idx = a->j;
1469   v   = a->a;
1470   if (usecprow) {
1471     if (zz != yy) {
1472       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1473     }
1474     mbs  = a->compressedrow.nrows;
1475     ii   = a->compressedrow.i;
1476     ridx = a->compressedrow.rindex;
1477   } else {
1478     ii = a->i;
1479     y  = yarray;
1480     z  = zarray;
1481   }
1482 
1483   for (i=0; i<mbs; i++) {
1484     n = ii[1] - ii[0]; ii++;
1485     if (usecprow) {
1486       z = zarray + 7*ridx[i];
1487       y = yarray + 7*ridx[i];
1488     }
1489     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1490     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1491     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1492     for (j=0; j<n; j++) {
1493       xb    = x + 7*(*idx++);
1494       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1495       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1496       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1497       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1498       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1499       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1500       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1501       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1502       v    += 49;
1503     }
1504     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1505     if (!usecprow) {
1506       z += 7; y += 7;
1507     }
1508   }
1509   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1510   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1511   if (zz != yy) {
1512     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1513   }
1514   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1515   PetscFunctionReturn(0);
1516 }
1517 
1518 #undef __FUNCT__
1519 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
1520 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1521 {
1522   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1523   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1524   MatScalar      *v;
1525   PetscErrorCode ierr;
1526   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1527   PetscInt       ncols,k,*ridx=NULL;
1528   PetscBool      usecprow=a->compressedrow.use;
1529 
1530   PetscFunctionBegin;
1531   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1532   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1533   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1534 
1535   idx = a->j;
1536   v   = a->a;
1537   if (usecprow) {
1538     mbs  = a->compressedrow.nrows;
1539     ii   = a->compressedrow.i;
1540     ridx = a->compressedrow.rindex;
1541   } else {
1542     mbs = a->mbs;
1543     ii  = a->i;
1544     z   = zarray;
1545   }
1546 
1547   if (!a->mult_work) {
1548     k    = PetscMax(A->rmap->n,A->cmap->n);
1549     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1550   }
1551   work = a->mult_work;
1552   for (i=0; i<mbs; i++) {
1553     n     = ii[1] - ii[0]; ii++;
1554     ncols = n*bs;
1555     workt = work;
1556     for (j=0; j<n; j++) {
1557       xb = x + bs*(*idx++);
1558       for (k=0; k<bs; k++) workt[k] = xb[k];
1559       workt += bs;
1560     }
1561     if (usecprow) z = zarray + bs*ridx[i];
1562     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1563     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1564     v += n*bs2;
1565     if (!usecprow) z += bs;
1566   }
1567   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1568   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1569   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1570   PetscFunctionReturn(0);
1571 }
1572 
1573 #undef __FUNCT__
1574 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
1575 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1576 {
1577   PetscScalar    zero = 0.0;
1578   PetscErrorCode ierr;
1579 
1580   PetscFunctionBegin;
1581   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1582   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1583   PetscFunctionReturn(0);
1584 }
1585 
1586 #undef __FUNCT__
1587 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
1588 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1589 {
1590   PetscScalar    zero = 0.0;
1591   PetscErrorCode ierr;
1592 
1593   PetscFunctionBegin;
1594   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1595   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 #undef __FUNCT__
1600 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
1601 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1602 {
1603   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1604   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1605   MatScalar         *v;
1606   PetscErrorCode    ierr;
1607   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
1608   Mat_CompressedRow cprow   = a->compressedrow;
1609   PetscBool         usecprow=cprow.use;
1610 
1611   PetscFunctionBegin;
1612   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1613   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1614   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1615 
1616   idx = a->j;
1617   v   = a->a;
1618   if (usecprow) {
1619     mbs  = cprow.nrows;
1620     ii   = cprow.i;
1621     ridx = cprow.rindex;
1622   } else {
1623     mbs=a->mbs;
1624     ii = a->i;
1625     xb = x;
1626   }
1627 
1628   switch (bs) {
1629   case 1:
1630     for (i=0; i<mbs; i++) {
1631       if (usecprow) xb = x + ridx[i];
1632       x1 = xb[0];
1633       ib = idx + ii[0];
1634       n  = ii[1] - ii[0]; ii++;
1635       for (j=0; j<n; j++) {
1636         rval     = ib[j];
1637         z[rval] += PetscConj(*v) * x1;
1638         v++;
1639       }
1640       if (!usecprow) xb++;
1641     }
1642     break;
1643   case 2:
1644     for (i=0; i<mbs; i++) {
1645       if (usecprow) xb = x + 2*ridx[i];
1646       x1 = xb[0]; x2 = xb[1];
1647       ib = idx + ii[0];
1648       n  = ii[1] - ii[0]; ii++;
1649       for (j=0; j<n; j++) {
1650         rval       = ib[j]*2;
1651         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1652         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1653         v         += 4;
1654       }
1655       if (!usecprow) xb += 2;
1656     }
1657     break;
1658   case 3:
1659     for (i=0; i<mbs; i++) {
1660       if (usecprow) xb = x + 3*ridx[i];
1661       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1662       ib = idx + ii[0];
1663       n  = ii[1] - ii[0]; ii++;
1664       for (j=0; j<n; j++) {
1665         rval       = ib[j]*3;
1666         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1667         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1668         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1669         v         += 9;
1670       }
1671       if (!usecprow) xb += 3;
1672     }
1673     break;
1674   case 4:
1675     for (i=0; i<mbs; i++) {
1676       if (usecprow) xb = x + 4*ridx[i];
1677       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1678       ib = idx + ii[0];
1679       n  = ii[1] - ii[0]; ii++;
1680       for (j=0; j<n; j++) {
1681         rval       = ib[j]*4;
1682         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1683         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1684         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1685         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1686         v         += 16;
1687       }
1688       if (!usecprow) xb += 4;
1689     }
1690     break;
1691   case 5:
1692     for (i=0; i<mbs; i++) {
1693       if (usecprow) xb = x + 5*ridx[i];
1694       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1695       x4 = xb[3]; x5 = xb[4];
1696       ib = idx + ii[0];
1697       n  = ii[1] - ii[0]; ii++;
1698       for (j=0; j<n; j++) {
1699         rval       = ib[j]*5;
1700         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1701         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1702         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1703         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1704         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1705         v         += 25;
1706       }
1707       if (!usecprow) xb += 5;
1708     }
1709     break;
1710   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
1711     PetscInt    ncols,k;
1712     PetscScalar *work,*workt,*xtmp;
1713 
1714     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1715     if (!a->mult_work) {
1716       k    = PetscMax(A->rmap->n,A->cmap->n);
1717       ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1718     }
1719     work = a->mult_work;
1720     xtmp = x;
1721     for (i=0; i<mbs; i++) {
1722       n     = ii[1] - ii[0]; ii++;
1723       ncols = n*bs;
1724       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1725       if (usecprow) xtmp = x + bs*ridx[i];
1726       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1727       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1728       v += n*bs2;
1729       if (!usecprow) xtmp += bs;
1730       workt = work;
1731       for (j=0; j<n; j++) {
1732         zb = z + bs*(*idx++);
1733         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1734         workt += bs;
1735       }
1736     }
1737     }
1738   }
1739   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1741   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1742   PetscFunctionReturn(0);
1743 }
1744 
1745 #undef __FUNCT__
1746 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
1747 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1748 {
1749   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1750   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1751   MatScalar         *v;
1752   PetscErrorCode    ierr;
1753   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
1754   Mat_CompressedRow cprow   = a->compressedrow;
1755   PetscBool         usecprow=cprow.use;
1756 
1757   PetscFunctionBegin;
1758   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1759   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1760   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1761 
1762   idx = a->j;
1763   v   = a->a;
1764   if (usecprow) {
1765     mbs  = cprow.nrows;
1766     ii   = cprow.i;
1767     ridx = cprow.rindex;
1768   } else {
1769     mbs=a->mbs;
1770     ii = a->i;
1771     xb = x;
1772   }
1773 
1774   switch (bs) {
1775   case 1:
1776     for (i=0; i<mbs; i++) {
1777       if (usecprow) xb = x + ridx[i];
1778       x1 = xb[0];
1779       ib = idx + ii[0];
1780       n  = ii[1] - ii[0]; ii++;
1781       for (j=0; j<n; j++) {
1782         rval     = ib[j];
1783         z[rval] += *v * x1;
1784         v++;
1785       }
1786       if (!usecprow) xb++;
1787     }
1788     break;
1789   case 2:
1790     for (i=0; i<mbs; i++) {
1791       if (usecprow) xb = x + 2*ridx[i];
1792       x1 = xb[0]; x2 = xb[1];
1793       ib = idx + ii[0];
1794       n  = ii[1] - ii[0]; ii++;
1795       for (j=0; j<n; j++) {
1796         rval       = ib[j]*2;
1797         z[rval++] += v[0]*x1 + v[1]*x2;
1798         z[rval++] += v[2]*x1 + v[3]*x2;
1799         v         += 4;
1800       }
1801       if (!usecprow) xb += 2;
1802     }
1803     break;
1804   case 3:
1805     for (i=0; i<mbs; i++) {
1806       if (usecprow) xb = x + 3*ridx[i];
1807       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1808       ib = idx + ii[0];
1809       n  = ii[1] - ii[0]; ii++;
1810       for (j=0; j<n; j++) {
1811         rval       = ib[j]*3;
1812         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1813         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1814         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1815         v         += 9;
1816       }
1817       if (!usecprow) xb += 3;
1818     }
1819     break;
1820   case 4:
1821     for (i=0; i<mbs; i++) {
1822       if (usecprow) xb = x + 4*ridx[i];
1823       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1824       ib = idx + ii[0];
1825       n  = ii[1] - ii[0]; ii++;
1826       for (j=0; j<n; j++) {
1827         rval       = ib[j]*4;
1828         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1829         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1830         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1831         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1832         v         += 16;
1833       }
1834       if (!usecprow) xb += 4;
1835     }
1836     break;
1837   case 5:
1838     for (i=0; i<mbs; i++) {
1839       if (usecprow) xb = x + 5*ridx[i];
1840       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1841       x4 = xb[3]; x5 = xb[4];
1842       ib = idx + ii[0];
1843       n  = ii[1] - ii[0]; ii++;
1844       for (j=0; j<n; j++) {
1845         rval       = ib[j]*5;
1846         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1847         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1848         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1849         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1850         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1851         v         += 25;
1852       }
1853       if (!usecprow) xb += 5;
1854     }
1855     break;
1856   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1857     PetscInt    ncols,k;
1858     PetscScalar *work,*workt,*xtmp;
1859 
1860     if (!a->mult_work) {
1861       k    = PetscMax(A->rmap->n,A->cmap->n);
1862       ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1863     }
1864     work = a->mult_work;
1865     xtmp = x;
1866     for (i=0; i<mbs; i++) {
1867       n     = ii[1] - ii[0]; ii++;
1868       ncols = n*bs;
1869       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1870       if (usecprow) xtmp = x + bs*ridx[i];
1871       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1872       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1873       v += n*bs2;
1874       if (!usecprow) xtmp += bs;
1875       workt = work;
1876       for (j=0; j<n; j++) {
1877         zb = z + bs*(*idx++);
1878         for (k=0; k<bs; k++) zb[k] += workt[k];
1879         workt += bs;
1880       }
1881     }
1882     }
1883   }
1884   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1885   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1886   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1887   PetscFunctionReturn(0);
1888 }
1889 
1890 #undef __FUNCT__
1891 #define __FUNCT__ "MatScale_SeqBAIJ"
1892 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1893 {
1894   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
1895   PetscInt       totalnz = a->bs2*a->nz;
1896   PetscScalar    oalpha  = alpha;
1897   PetscErrorCode ierr;
1898   PetscBLASInt   one = 1,tnz;
1899 
1900   PetscFunctionBegin;
1901   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
1902   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
1903   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1904   PetscFunctionReturn(0);
1905 }
1906 
1907 #undef __FUNCT__
1908 #define __FUNCT__ "MatNorm_SeqBAIJ"
1909 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1910 {
1911   PetscErrorCode ierr;
1912   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
1913   MatScalar      *v  = a->a;
1914   PetscReal      sum = 0.0;
1915   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1916 
1917   PetscFunctionBegin;
1918   if (type == NORM_FROBENIUS) {
1919     for (i=0; i< bs2*nz; i++) {
1920       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1921     }
1922     *norm = PetscSqrtReal(sum);
1923   } else if (type == NORM_1) { /* maximum column sum */
1924     PetscReal *tmp;
1925     PetscInt  *bcol = a->j;
1926     ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
1927     ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr);
1928     for (i=0; i<nz; i++) {
1929       for (j=0; j<bs; j++) {
1930         k1 = bs*(*bcol) + j; /* column index */
1931         for (k=0; k<bs; k++) {
1932           tmp[k1] += PetscAbsScalar(*v); v++;
1933         }
1934       }
1935       bcol++;
1936     }
1937     *norm = 0.0;
1938     for (j=0; j<A->cmap->n; j++) {
1939       if (tmp[j] > *norm) *norm = tmp[j];
1940     }
1941     ierr = PetscFree(tmp);CHKERRQ(ierr);
1942   } else if (type == NORM_INFINITY) { /* maximum row sum */
1943     *norm = 0.0;
1944     for (k=0; k<bs; k++) {
1945       for (j=0; j<a->mbs; j++) {
1946         v   = a->a + bs2*a->i[j] + k;
1947         sum = 0.0;
1948         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1949           for (k1=0; k1<bs; k1++) {
1950             sum += PetscAbsScalar(*v);
1951             v   += bs;
1952           }
1953         }
1954         if (sum > *norm) *norm = sum;
1955       }
1956     }
1957   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1958   PetscFunctionReturn(0);
1959 }
1960 
1961 
1962 #undef __FUNCT__
1963 #define __FUNCT__ "MatEqual_SeqBAIJ"
1964 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1965 {
1966   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
1967   PetscErrorCode ierr;
1968 
1969   PetscFunctionBegin;
1970   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1971   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1972     *flg = PETSC_FALSE;
1973     PetscFunctionReturn(0);
1974   }
1975 
1976   /* if the a->i are the same */
1977   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1978   if (!*flg) PetscFunctionReturn(0);
1979 
1980   /* if a->j are the same */
1981   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1982   if (!*flg) PetscFunctionReturn(0);
1983 
1984   /* if a->a are the same */
1985   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1986   PetscFunctionReturn(0);
1987 
1988 }
1989 
1990 #undef __FUNCT__
1991 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
1992 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1993 {
1994   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1995   PetscErrorCode ierr;
1996   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1997   PetscScalar    *x,zero = 0.0;
1998   MatScalar      *aa,*aa_j;
1999 
2000   PetscFunctionBegin;
2001   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2002   bs   = A->rmap->bs;
2003   aa   = a->a;
2004   ai   = a->i;
2005   aj   = a->j;
2006   ambs = a->mbs;
2007   bs2  = a->bs2;
2008 
2009   ierr = VecSet(v,zero);CHKERRQ(ierr);
2010   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2011   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2012   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2013   for (i=0; i<ambs; i++) {
2014     for (j=ai[i]; j<ai[i+1]; j++) {
2015       if (aj[j] == i) {
2016         row  = i*bs;
2017         aa_j = aa+j*bs2;
2018         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2019         break;
2020       }
2021     }
2022   }
2023   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2024   PetscFunctionReturn(0);
2025 }
2026 
2027 #undef __FUNCT__
2028 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2029 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2030 {
2031   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2032   const PetscScalar *l,*r,*li,*ri;
2033   PetscScalar       x;
2034   MatScalar         *aa, *v;
2035   PetscErrorCode    ierr;
2036   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2037   const PetscInt    *ai,*aj;
2038 
2039   PetscFunctionBegin;
2040   ai  = a->i;
2041   aj  = a->j;
2042   aa  = a->a;
2043   m   = A->rmap->n;
2044   n   = A->cmap->n;
2045   bs  = A->rmap->bs;
2046   mbs = a->mbs;
2047   bs2 = a->bs2;
2048   if (ll) {
2049     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2050     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2051     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2052     for (i=0; i<mbs; i++) { /* for each block row */
2053       M  = ai[i+1] - ai[i];
2054       li = l + i*bs;
2055       v  = aa + bs2*ai[i];
2056       for (j=0; j<M; j++) { /* for each block */
2057         for (k=0; k<bs2; k++) {
2058           (*v++) *= li[k%bs];
2059         }
2060       }
2061     }
2062     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2063     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2064   }
2065 
2066   if (rr) {
2067     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2068     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2069     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2070     for (i=0; i<mbs; i++) { /* for each block row */
2071       iai = ai[i];
2072       M   = ai[i+1] - iai;
2073       v   = aa + bs2*iai;
2074       for (j=0; j<M; j++) { /* for each block */
2075         ri = r + bs*aj[iai+j];
2076         for (k=0; k<bs; k++) {
2077           x = ri[k];
2078           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2079           v += bs;
2080         }
2081       }
2082     }
2083     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2084     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2085   }
2086   PetscFunctionReturn(0);
2087 }
2088 
2089 
2090 #undef __FUNCT__
2091 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2092 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2093 {
2094   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2095 
2096   PetscFunctionBegin;
2097   info->block_size   = a->bs2;
2098   info->nz_allocated = a->bs2*a->maxnz;
2099   info->nz_used      = a->bs2*a->nz;
2100   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2101   info->assemblies   = A->num_ass;
2102   info->mallocs      = A->info.mallocs;
2103   info->memory       = ((PetscObject)A)->mem;
2104   if (A->factortype) {
2105     info->fill_ratio_given  = A->info.fill_ratio_given;
2106     info->fill_ratio_needed = A->info.fill_ratio_needed;
2107     info->factor_mallocs    = A->info.factor_mallocs;
2108   } else {
2109     info->fill_ratio_given  = 0;
2110     info->fill_ratio_needed = 0;
2111     info->factor_mallocs    = 0;
2112   }
2113   PetscFunctionReturn(0);
2114 }
2115 
2116 
2117 #undef __FUNCT__
2118 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2119 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2120 {
2121   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2122   PetscErrorCode ierr;
2123 
2124   PetscFunctionBegin;
2125   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2126   PetscFunctionReturn(0);
2127 }
2128