xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 2f6eced2a19e978d64f27de66fbc6c26cc5c7934)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
8 {
9   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
10   PetscErrorCode ierr;
11   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
12   const PetscInt *idx;
13   PetscInt       start,end,*ai,*aj,bs,*nidx2;
14   PetscBT        table;
15 
16   PetscFunctionBegin;
17   m  = a->mbs;
18   ai = a->i;
19   aj = a->j;
20   bs = A->rmap->bs;
21 
22   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
23 
24   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
25   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
27 
28   for (i=0; i<is_max; i++) {
29     /* Initialise the two local arrays */
30     isz  = 0;
31     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
38     for (j=0; j<n; ++j) {
39       ival = idx[j]/bs; /* convert the indices into block indices */
40       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
42     }
43     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
44     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
45 
46     k = 0;
47     for (j=0; j<ov; j++) { /* for each overlap*/
48       n = isz;
49       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
50         row   = nidx[k];
51         start = ai[row];
52         end   = ai[row+1];
53         for (l = start; l<end; l++) {
54           val = aj[l];
55           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
56         }
57       }
58     }
59     /* expand the Index Set */
60     for (j=0; j<isz; j++) {
61       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
62     }
63     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
64   }
65   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
66   ierr = PetscFree(nidx);CHKERRQ(ierr);
67   ierr = PetscFree(nidx2);CHKERRQ(ierr);
68   PetscFunctionReturn(0);
69 }
70 
71 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
72 {
73   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
74   PetscErrorCode ierr;
75   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
76   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
77   const PetscInt *irow,*icol;
78   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
79   PetscInt       *aj = a->j,*ai = a->i;
80   MatScalar      *mat_a;
81   Mat            C;
82   PetscBool      flag;
83 
84   PetscFunctionBegin;
85   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
86   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
87   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
88   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
89 
90   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
91   ssmap = smap;
92   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
93   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
94   /* determine lens of each row */
95   for (i=0; i<nrows; i++) {
96     kstart  = ai[irow[i]];
97     kend    = kstart + a->ilen[irow[i]];
98     lens[i] = 0;
99     for (k=kstart; k<kend; k++) {
100       if (ssmap[aj[k]]) lens[i]++;
101     }
102   }
103   /* Create and fill new matrix */
104   if (scall == MAT_REUSE_MATRIX) {
105     c = (Mat_SeqBAIJ*)((*B)->data);
106 
107     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
108     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
109     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
110     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
111     C    = *B;
112   } else {
113     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
114     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
115     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
116     ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
117   }
118   c = (Mat_SeqBAIJ*)(C->data);
119   for (i=0; i<nrows; i++) {
120     row      = irow[i];
121     kstart   = ai[row];
122     kend     = kstart + a->ilen[row];
123     mat_i    = c->i[i];
124     mat_j    = c->j + mat_i;
125     mat_a    = c->a + mat_i*bs2;
126     mat_ilen = c->ilen + i;
127     for (k=kstart; k<kend; k++) {
128       if ((tcol=ssmap[a->j[k]])) {
129         *mat_j++ = tcol - 1;
130         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
131         mat_a   += bs2;
132         (*mat_ilen)++;
133       }
134     }
135   }
136   /* sort */
137   {
138     MatScalar *work;
139     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
140     for (i=0; i<nrows; i++) {
141       PetscInt ilen;
142       mat_i = c->i[i];
143       mat_j = c->j + mat_i;
144       mat_a = c->a + mat_i*bs2;
145       ilen  = c->ilen[i];
146       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
147     }
148     ierr = PetscFree(work);CHKERRQ(ierr);
149   }
150 
151   /* Free work space */
152   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
153   ierr = PetscFree(smap);CHKERRQ(ierr);
154   ierr = PetscFree(lens);CHKERRQ(ierr);
155   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
156   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
157 
158   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
159   *B   = C;
160   PetscFunctionReturn(0);
161 }
162 
163 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
164 {
165   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166   IS             is1,is2;
167   PetscErrorCode ierr;
168   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
169   const PetscInt *irow,*icol;
170 
171   PetscFunctionBegin;
172   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
173   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
174   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
175   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
176 
177   /* Verify if the indices corespond to each element in a block
178    and form the IS with compressed IS */
179   maxmnbs = PetscMax(a->mbs,a->nbs);
180   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
181   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
182   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
183   for (i=0; i<a->mbs; i++) {
184     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185   }
186   count = 0;
187   for (i=0; i<nrows; i++) {
188     j = irow[i] / bs;
189     if ((vary[j]--)==bs) iary[count++] = j;
190   }
191   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
192 
193   ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr);
194   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
195   for (i=0; i<a->nbs; i++) {
196     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
197   }
198   count = 0;
199   for (i=0; i<ncols; i++) {
200     j = icol[i] / bs;
201     if ((vary[j]--)==bs) iary[count++] = j;
202   }
203   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
204   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
206   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
207 
208   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
209   ierr = ISDestroy(&is1);CHKERRQ(ierr);
210   ierr = ISDestroy(&is2);CHKERRQ(ierr);
211   PetscFunctionReturn(0);
212 }
213 
214 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
215 {
216   PetscErrorCode ierr;
217   PetscInt       i;
218 
219   PetscFunctionBegin;
220   if (scall == MAT_INITIAL_MATRIX) {
221     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
222   }
223 
224   for (i=0; i<n; i++) {
225     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
226   }
227   PetscFunctionReturn(0);
228 }
229 
230 
231 /* -------------------------------------------------------*/
232 /* Should check that shapes of vectors and matrices match */
233 /* -------------------------------------------------------*/
234 
235 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
236 {
237   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
238   PetscScalar       *z,sum;
239   const PetscScalar *x;
240   const MatScalar   *v;
241   PetscErrorCode    ierr;
242   PetscInt          mbs,i,n;
243   const PetscInt    *idx,*ii,*ridx=NULL;
244   PetscBool         usecprow=a->compressedrow.use;
245 
246   PetscFunctionBegin;
247   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
248   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
249 
250   if (usecprow) {
251     mbs  = a->compressedrow.nrows;
252     ii   = a->compressedrow.i;
253     ridx = a->compressedrow.rindex;
254     ierr = PetscMemzero(z,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
255   } else {
256     mbs = a->mbs;
257     ii  = a->i;
258   }
259 
260   for (i=0; i<mbs; i++) {
261     n   = ii[1] - ii[0];
262     v   = a->a + ii[0];
263     idx = a->j + ii[0];
264     ii++;
265     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
266     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
267     sum = 0.0;
268     PetscSparseDensePlusDot(sum,x,v,idx,n);
269     if (usecprow) {
270       z[ridx[i]] = sum;
271     } else {
272       z[i]       = sum;
273     }
274   }
275   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
276   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
277   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
278   PetscFunctionReturn(0);
279 }
280 
281 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
282 {
283   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
284   PetscScalar       *z = 0,sum1,sum2,*zarray;
285   const PetscScalar *x,*xb;
286   PetscScalar       x1,x2;
287   const MatScalar   *v;
288   PetscErrorCode    ierr;
289   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
290   PetscBool         usecprow=a->compressedrow.use;
291 
292   PetscFunctionBegin;
293   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
294   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
295 
296   idx = a->j;
297   v   = a->a;
298   if (usecprow) {
299     mbs  = a->compressedrow.nrows;
300     ii   = a->compressedrow.i;
301     ridx = a->compressedrow.rindex;
302     ierr = PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
303   } else {
304     mbs = a->mbs;
305     ii  = a->i;
306     z   = zarray;
307   }
308 
309   for (i=0; i<mbs; i++) {
310     n           = ii[1] - ii[0]; ii++;
311     sum1        = 0.0; sum2 = 0.0;
312     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
313     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
314     for (j=0; j<n; j++) {
315       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
316       sum1 += v[0]*x1 + v[2]*x2;
317       sum2 += v[1]*x1 + v[3]*x2;
318       v    += 4;
319     }
320     if (usecprow) z = zarray + 2*ridx[i];
321     z[0] = sum1; z[1] = sum2;
322     if (!usecprow) z += 2;
323   }
324   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
325   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
326   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
327   PetscFunctionReturn(0);
328 }
329 
330 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
331 {
332   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
333   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
334   const PetscScalar *x,*xb;
335   const MatScalar   *v;
336   PetscErrorCode    ierr;
337   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
338   PetscBool         usecprow=a->compressedrow.use;
339 
340 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
341 #pragma disjoint(*v,*z,*xb)
342 #endif
343 
344   PetscFunctionBegin;
345   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
346   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
347 
348   idx = a->j;
349   v   = a->a;
350   if (usecprow) {
351     mbs  = a->compressedrow.nrows;
352     ii   = a->compressedrow.i;
353     ridx = a->compressedrow.rindex;
354     ierr = PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
355   } else {
356     mbs = a->mbs;
357     ii  = a->i;
358     z   = zarray;
359   }
360 
361   for (i=0; i<mbs; i++) {
362     n           = ii[1] - ii[0]; ii++;
363     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
364     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
365     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
366     for (j=0; j<n; j++) {
367       xb = x + 3*(*idx++);
368       x1 = xb[0];
369       x2 = xb[1];
370       x3 = xb[2];
371 
372       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
373       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
374       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
375       v    += 9;
376     }
377     if (usecprow) z = zarray + 3*ridx[i];
378     z[0] = sum1; z[1] = sum2; z[2] = sum3;
379     if (!usecprow) z += 3;
380   }
381   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
382   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
383   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
384   PetscFunctionReturn(0);
385 }
386 
387 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
388 {
389   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
390   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
391   const PetscScalar *x,*xb;
392   const MatScalar   *v;
393   PetscErrorCode    ierr;
394   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
395   PetscBool         usecprow=a->compressedrow.use;
396 
397   PetscFunctionBegin;
398   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
399   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
400 
401   idx = a->j;
402   v   = a->a;
403   if (usecprow) {
404     mbs  = a->compressedrow.nrows;
405     ii   = a->compressedrow.i;
406     ridx = a->compressedrow.rindex;
407     ierr = PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
408   } else {
409     mbs = a->mbs;
410     ii  = a->i;
411     z   = zarray;
412   }
413 
414   for (i=0; i<mbs; i++) {
415     n = ii[1] - ii[0];
416     ii++;
417     sum1 = 0.0;
418     sum2 = 0.0;
419     sum3 = 0.0;
420     sum4 = 0.0;
421 
422     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
423     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
424     for (j=0; j<n; j++) {
425       xb    = x + 4*(*idx++);
426       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
427       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
428       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
429       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
430       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
431       v    += 16;
432     }
433     if (usecprow) z = zarray + 4*ridx[i];
434     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
435     if (!usecprow) z += 4;
436   }
437   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
438   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
439   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
440   PetscFunctionReturn(0);
441 }
442 
443 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
444 {
445   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
446   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
447   const PetscScalar *xb,*x;
448   const MatScalar   *v;
449   PetscErrorCode    ierr;
450   const PetscInt    *idx,*ii,*ridx=NULL;
451   PetscInt          mbs,i,j,n;
452   PetscBool         usecprow=a->compressedrow.use;
453 
454   PetscFunctionBegin;
455   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
456   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
457 
458   idx = a->j;
459   v   = a->a;
460   if (usecprow) {
461     mbs  = a->compressedrow.nrows;
462     ii   = a->compressedrow.i;
463     ridx = a->compressedrow.rindex;
464     ierr = PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
465   } else {
466     mbs = a->mbs;
467     ii  = a->i;
468     z   = zarray;
469   }
470 
471   for (i=0; i<mbs; i++) {
472     n           = ii[1] - ii[0]; ii++;
473     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
474     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
475     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
476     for (j=0; j<n; j++) {
477       xb    = x + 5*(*idx++);
478       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
479       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
480       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
481       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
482       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
483       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
484       v    += 25;
485     }
486     if (usecprow) z = zarray + 5*ridx[i];
487     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
488     if (!usecprow) z += 5;
489   }
490   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
491   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
492   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
493   PetscFunctionReturn(0);
494 }
495 
496 
497 
498 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
499 {
500   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
501   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
502   const PetscScalar *x,*xb;
503   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
504   const MatScalar   *v;
505   PetscErrorCode    ierr;
506   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
507   PetscBool         usecprow=a->compressedrow.use;
508 
509   PetscFunctionBegin;
510   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
511   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
512 
513   idx = a->j;
514   v   = a->a;
515   if (usecprow) {
516     mbs  = a->compressedrow.nrows;
517     ii   = a->compressedrow.i;
518     ridx = a->compressedrow.rindex;
519     ierr = PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
520   } else {
521     mbs = a->mbs;
522     ii  = a->i;
523     z   = zarray;
524   }
525 
526   for (i=0; i<mbs; i++) {
527     n  = ii[1] - ii[0];
528     ii++;
529     sum1 = 0.0;
530     sum2 = 0.0;
531     sum3 = 0.0;
532     sum4 = 0.0;
533     sum5 = 0.0;
534     sum6 = 0.0;
535 
536     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
537     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
538     for (j=0; j<n; j++) {
539       xb    = x + 6*(*idx++);
540       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
541       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
542       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
543       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
544       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
545       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
546       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
547       v    += 36;
548     }
549     if (usecprow) z = zarray + 6*ridx[i];
550     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
551     if (!usecprow) z += 6;
552   }
553 
554   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
555   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
556   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
557   PetscFunctionReturn(0);
558 }
559 
560 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
561 {
562   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
563   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
564   const PetscScalar *x,*xb;
565   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
566   const MatScalar   *v;
567   PetscErrorCode    ierr;
568   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
569   PetscBool         usecprow=a->compressedrow.use;
570 
571   PetscFunctionBegin;
572   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
573   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
574 
575   idx = a->j;
576   v   = a->a;
577   if (usecprow) {
578     mbs  = a->compressedrow.nrows;
579     ii   = a->compressedrow.i;
580     ridx = a->compressedrow.rindex;
581     ierr = PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
582   } else {
583     mbs = a->mbs;
584     ii  = a->i;
585     z   = zarray;
586   }
587 
588   for (i=0; i<mbs; i++) {
589     n  = ii[1] - ii[0];
590     ii++;
591     sum1 = 0.0;
592     sum2 = 0.0;
593     sum3 = 0.0;
594     sum4 = 0.0;
595     sum5 = 0.0;
596     sum6 = 0.0;
597     sum7 = 0.0;
598 
599     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
600     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
601     for (j=0; j<n; j++) {
602       xb    = x + 7*(*idx++);
603       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
604       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
605       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
606       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
607       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
608       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
609       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
610       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
611       v    += 49;
612     }
613     if (usecprow) z = zarray + 7*ridx[i];
614     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
615     if (!usecprow) z += 7;
616   }
617 
618   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
619   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
620   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
621   PetscFunctionReturn(0);
622 }
623 
624 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
625 /* Default MatMult for block size 15 */
626 
627 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
628 {
629   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
630   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
631   const PetscScalar *x,*xb;
632   PetscScalar       *zarray,xv;
633   const MatScalar   *v;
634   PetscErrorCode    ierr;
635   const PetscInt    *ii,*ij=a->j,*idx;
636   PetscInt          mbs,i,j,k,n,*ridx=NULL;
637   PetscBool         usecprow=a->compressedrow.use;
638 
639   PetscFunctionBegin;
640   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
641   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
642 
643   v = a->a;
644   if (usecprow) {
645     mbs  = a->compressedrow.nrows;
646     ii   = a->compressedrow.i;
647     ridx = a->compressedrow.rindex;
648     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
649   } else {
650     mbs = a->mbs;
651     ii  = a->i;
652     z   = zarray;
653   }
654 
655   for (i=0; i<mbs; i++) {
656     n    = ii[i+1] - ii[i];
657     idx  = ij + ii[i];
658     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
659     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
660 
661     for (j=0; j<n; j++) {
662       xb = x + 15*(idx[j]);
663 
664       for (k=0; k<15; k++) {
665         xv     =  xb[k];
666         sum1  += v[0]*xv;
667         sum2  += v[1]*xv;
668         sum3  += v[2]*xv;
669         sum4  += v[3]*xv;
670         sum5  += v[4]*xv;
671         sum6  += v[5]*xv;
672         sum7  += v[6]*xv;
673         sum8  += v[7]*xv;
674         sum9  += v[8]*xv;
675         sum10 += v[9]*xv;
676         sum11 += v[10]*xv;
677         sum12 += v[11]*xv;
678         sum13 += v[12]*xv;
679         sum14 += v[13]*xv;
680         sum15 += v[14]*xv;
681         v     += 15;
682       }
683     }
684     if (usecprow) z = zarray + 15*ridx[i];
685     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
686     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
687 
688     if (!usecprow) z += 15;
689   }
690 
691   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
692   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
693   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
694   PetscFunctionReturn(0);
695 }
696 
697 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
698 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
699 {
700   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
701   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
702   const PetscScalar *x,*xb;
703   PetscScalar       x1,x2,x3,x4,*zarray;
704   const MatScalar   *v;
705   PetscErrorCode    ierr;
706   const PetscInt    *ii,*ij=a->j,*idx;
707   PetscInt          mbs,i,j,n,*ridx=NULL;
708   PetscBool         usecprow=a->compressedrow.use;
709 
710   PetscFunctionBegin;
711   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
712   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
713 
714   v = a->a;
715   if (usecprow) {
716     mbs  = a->compressedrow.nrows;
717     ii   = a->compressedrow.i;
718     ridx = a->compressedrow.rindex;
719     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
720   } else {
721     mbs = a->mbs;
722     ii  = a->i;
723     z   = zarray;
724   }
725 
726   for (i=0; i<mbs; i++) {
727     n    = ii[i+1] - ii[i];
728     idx  = ij + ii[i];
729     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
730     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
731 
732     for (j=0; j<n; j++) {
733       xb = x + 15*(idx[j]);
734       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
735 
736       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
737       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
738       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
739       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
740       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
741       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
742       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
743       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
744       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
745       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
746       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
747       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
748       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
749       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
750       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
751 
752       v += 60;
753 
754       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
755 
756       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
757       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
758       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
759       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
760       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
761       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
762       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
763       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
764       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
765       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
766       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
767       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
768       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
769       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
770       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
771       v     += 60;
772 
773       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
774       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
775       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
776       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
777       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
778       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
779       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
780       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
781       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
782       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
783       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
784       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
785       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
786       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
787       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
788       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
789       v     += 60;
790 
791       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
792       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
793       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
794       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
795       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
796       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
797       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
798       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
799       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
800       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
801       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
802       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
803       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
804       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
805       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
806       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
807       v     += 45;
808     }
809     if (usecprow) z = zarray + 15*ridx[i];
810     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
811     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
812 
813     if (!usecprow) z += 15;
814   }
815 
816   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
817   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
818   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
819   PetscFunctionReturn(0);
820 }
821 
822 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
823 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
824 {
825   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
826   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
827   const PetscScalar *x,*xb;
828   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
829   const MatScalar   *v;
830   PetscErrorCode    ierr;
831   const PetscInt    *ii,*ij=a->j,*idx;
832   PetscInt          mbs,i,j,n,*ridx=NULL;
833   PetscBool         usecprow=a->compressedrow.use;
834 
835   PetscFunctionBegin;
836   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
837   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
838 
839   v = a->a;
840   if (usecprow) {
841     mbs  = a->compressedrow.nrows;
842     ii   = a->compressedrow.i;
843     ridx = a->compressedrow.rindex;
844     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
845   } else {
846     mbs = a->mbs;
847     ii  = a->i;
848     z   = zarray;
849   }
850 
851   for (i=0; i<mbs; i++) {
852     n    = ii[i+1] - ii[i];
853     idx  = ij + ii[i];
854     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
855     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
856 
857     for (j=0; j<n; j++) {
858       xb = x + 15*(idx[j]);
859       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
860       x8 = xb[7];
861 
862       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
863       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
864       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
865       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
866       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
867       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
868       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
869       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
870       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
871       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
872       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
873       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
874       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
875       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
876       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
877       v     += 120;
878 
879       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
880 
881       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
882       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
883       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
884       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
885       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
886       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
887       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
888       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
889       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
890       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
891       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
892       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
893       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
894       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
895       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
896       v     += 105;
897     }
898     if (usecprow) z = zarray + 15*ridx[i];
899     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
900     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
901 
902     if (!usecprow) z += 15;
903   }
904 
905   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
906   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
907   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
908   PetscFunctionReturn(0);
909 }
910 
911 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
912 
913 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
914 {
915   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
916   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
917   const PetscScalar *x,*xb;
918   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
919   const MatScalar   *v;
920   PetscErrorCode    ierr;
921   const PetscInt    *ii,*ij=a->j,*idx;
922   PetscInt          mbs,i,j,n,*ridx=NULL;
923   PetscBool         usecprow=a->compressedrow.use;
924 
925   PetscFunctionBegin;
926   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
927   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
928 
929   v = a->a;
930   if (usecprow) {
931     mbs  = a->compressedrow.nrows;
932     ii   = a->compressedrow.i;
933     ridx = a->compressedrow.rindex;
934     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
935   } else {
936     mbs = a->mbs;
937     ii  = a->i;
938     z   = zarray;
939   }
940 
941   for (i=0; i<mbs; i++) {
942     n    = ii[i+1] - ii[i];
943     idx  = ij + ii[i];
944     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
945     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
946 
947     for (j=0; j<n; j++) {
948       xb = x + 15*(idx[j]);
949       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
950       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
951 
952       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
953       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
954       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
955       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
956       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
957       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
958       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
959       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
960       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
961       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
962       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
963       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
964       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
965       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
966       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
967       v     += 225;
968     }
969     if (usecprow) z = zarray + 15*ridx[i];
970     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
971     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
972 
973     if (!usecprow) z += 15;
974   }
975 
976   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
977   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
978   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
979   PetscFunctionReturn(0);
980 }
981 
982 
983 /*
984     This will not work with MatScalar == float because it calls the BLAS
985 */
986 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
987 {
988   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
989   PetscScalar       *z = 0,*work,*workt,*zarray;
990   const PetscScalar *x,*xb;
991   const MatScalar   *v;
992   PetscErrorCode    ierr;
993   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
994   const PetscInt    *idx,*ii,*ridx=NULL;
995   PetscInt          ncols,k;
996   PetscBool         usecprow=a->compressedrow.use;
997 
998   PetscFunctionBegin;
999   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1000   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1001 
1002   idx = a->j;
1003   v   = a->a;
1004   if (usecprow) {
1005     mbs  = a->compressedrow.nrows;
1006     ii   = a->compressedrow.i;
1007     ridx = a->compressedrow.rindex;
1008     ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1009   } else {
1010     mbs = a->mbs;
1011     ii  = a->i;
1012     z   = zarray;
1013   }
1014 
1015   if (!a->mult_work) {
1016     k    = PetscMax(A->rmap->n,A->cmap->n);
1017     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1018   }
1019   work = a->mult_work;
1020   for (i=0; i<mbs; i++) {
1021     n           = ii[1] - ii[0]; ii++;
1022     ncols       = n*bs;
1023     workt       = work;
1024     for (j=0; j<n; j++) {
1025       xb = x + bs*(*idx++);
1026       for (k=0; k<bs; k++) workt[k] = xb[k];
1027       workt += bs;
1028     }
1029     if (usecprow) z = zarray + bs*ridx[i];
1030     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1031     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1032     v += n*bs2;
1033     if (!usecprow) z += bs;
1034   }
1035   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1036   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1037   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1038   PetscFunctionReturn(0);
1039 }
1040 
1041 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1042 {
1043   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1044   const PetscScalar *x;
1045   PetscScalar       *y,*z,sum;
1046   const MatScalar   *v;
1047   PetscErrorCode    ierr;
1048   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1049   const PetscInt    *idx,*ii;
1050   PetscBool         usecprow=a->compressedrow.use;
1051 
1052   PetscFunctionBegin;
1053   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1054   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1055 
1056   idx = a->j;
1057   v   = a->a;
1058   if (usecprow) {
1059     if (zz != yy) {
1060       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1061     }
1062     mbs  = a->compressedrow.nrows;
1063     ii   = a->compressedrow.i;
1064     ridx = a->compressedrow.rindex;
1065   } else {
1066     ii = a->i;
1067   }
1068 
1069   for (i=0; i<mbs; i++) {
1070     n = ii[1] - ii[0];
1071     ii++;
1072     if (!usecprow) {
1073       sum         = y[i];
1074     } else {
1075       sum = y[ridx[i]];
1076     }
1077     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1078     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1079     PetscSparseDensePlusDot(sum,x,v,idx,n);
1080     v   += n;
1081     idx += n;
1082     if (usecprow) {
1083       z[ridx[i]] = sum;
1084     } else {
1085       z[i] = sum;
1086     }
1087   }
1088   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1089   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1090   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1091   PetscFunctionReturn(0);
1092 }
1093 
1094 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1095 {
1096   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1097   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1098   const PetscScalar *x,*xb;
1099   PetscScalar       x1,x2,*yarray,*zarray;
1100   const MatScalar   *v;
1101   PetscErrorCode    ierr;
1102   PetscInt          mbs = a->mbs,i,n,j;
1103   const PetscInt    *idx,*ii,*ridx = NULL;
1104   PetscBool         usecprow = a->compressedrow.use;
1105 
1106   PetscFunctionBegin;
1107   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1108   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1109 
1110   idx = a->j;
1111   v   = a->a;
1112   if (usecprow) {
1113     if (zz != yy) {
1114       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1115     }
1116     mbs  = a->compressedrow.nrows;
1117     ii   = a->compressedrow.i;
1118     ridx = a->compressedrow.rindex;
1119   } else {
1120     ii = a->i;
1121     y  = yarray;
1122     z  = zarray;
1123   }
1124 
1125   for (i=0; i<mbs; i++) {
1126     n = ii[1] - ii[0]; ii++;
1127     if (usecprow) {
1128       z = zarray + 2*ridx[i];
1129       y = yarray + 2*ridx[i];
1130     }
1131     sum1 = y[0]; sum2 = y[1];
1132     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1133     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1134     for (j=0; j<n; j++) {
1135       xb = x + 2*(*idx++);
1136       x1 = xb[0];
1137       x2 = xb[1];
1138 
1139       sum1 += v[0]*x1 + v[2]*x2;
1140       sum2 += v[1]*x1 + v[3]*x2;
1141       v    += 4;
1142     }
1143     z[0] = sum1; z[1] = sum2;
1144     if (!usecprow) {
1145       z += 2; y += 2;
1146     }
1147   }
1148   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1149   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1150   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1151   PetscFunctionReturn(0);
1152 }
1153 
1154 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1155 {
1156   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1157   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1158   const PetscScalar *x,*xb;
1159   const MatScalar   *v;
1160   PetscErrorCode    ierr;
1161   PetscInt          mbs = a->mbs,i,j,n;
1162   const PetscInt    *idx,*ii,*ridx = NULL;
1163   PetscBool         usecprow = a->compressedrow.use;
1164 
1165   PetscFunctionBegin;
1166   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1167   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1168 
1169   idx = a->j;
1170   v   = a->a;
1171   if (usecprow) {
1172     if (zz != yy) {
1173       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1174     }
1175     mbs  = a->compressedrow.nrows;
1176     ii   = a->compressedrow.i;
1177     ridx = a->compressedrow.rindex;
1178   } else {
1179     ii = a->i;
1180     y  = yarray;
1181     z  = zarray;
1182   }
1183 
1184   for (i=0; i<mbs; i++) {
1185     n = ii[1] - ii[0]; ii++;
1186     if (usecprow) {
1187       z = zarray + 3*ridx[i];
1188       y = yarray + 3*ridx[i];
1189     }
1190     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1191     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1192     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1193     for (j=0; j<n; j++) {
1194       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1195       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1196       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1197       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1198       v    += 9;
1199     }
1200     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1201     if (!usecprow) {
1202       z += 3; y += 3;
1203     }
1204   }
1205   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1206   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1207   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1208   PetscFunctionReturn(0);
1209 }
1210 
1211 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1212 {
1213   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1214   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1215   const PetscScalar *x,*xb;
1216   const MatScalar   *v;
1217   PetscErrorCode    ierr;
1218   PetscInt          mbs = a->mbs,i,j,n;
1219   const PetscInt    *idx,*ii,*ridx=NULL;
1220   PetscBool         usecprow=a->compressedrow.use;
1221 
1222   PetscFunctionBegin;
1223   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1224   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1225 
1226   idx = a->j;
1227   v   = a->a;
1228   if (usecprow) {
1229     if (zz != yy) {
1230       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1231     }
1232     mbs  = a->compressedrow.nrows;
1233     ii   = a->compressedrow.i;
1234     ridx = a->compressedrow.rindex;
1235   } else {
1236     ii = a->i;
1237     y  = yarray;
1238     z  = zarray;
1239   }
1240 
1241   for (i=0; i<mbs; i++) {
1242     n = ii[1] - ii[0]; ii++;
1243     if (usecprow) {
1244       z = zarray + 4*ridx[i];
1245       y = yarray + 4*ridx[i];
1246     }
1247     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1248     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1249     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1250     for (j=0; j<n; j++) {
1251       xb    = x + 4*(*idx++);
1252       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1253       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1254       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1255       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1256       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1257       v    += 16;
1258     }
1259     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1260     if (!usecprow) {
1261       z += 4; y += 4;
1262     }
1263   }
1264   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1265   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1266   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1267   PetscFunctionReturn(0);
1268 }
1269 
1270 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1271 {
1272   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1273   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1274   const PetscScalar *x,*xb;
1275   PetscScalar       *yarray,*zarray;
1276   const MatScalar   *v;
1277   PetscErrorCode    ierr;
1278   PetscInt          mbs = a->mbs,i,j,n;
1279   const PetscInt    *idx,*ii,*ridx = NULL;
1280   PetscBool         usecprow=a->compressedrow.use;
1281 
1282   PetscFunctionBegin;
1283   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1284   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1285 
1286   idx = a->j;
1287   v   = a->a;
1288   if (usecprow) {
1289     if (zz != yy) {
1290       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1291     }
1292     mbs  = a->compressedrow.nrows;
1293     ii   = a->compressedrow.i;
1294     ridx = a->compressedrow.rindex;
1295   } else {
1296     ii = a->i;
1297     y  = yarray;
1298     z  = zarray;
1299   }
1300 
1301   for (i=0; i<mbs; i++) {
1302     n = ii[1] - ii[0]; ii++;
1303     if (usecprow) {
1304       z = zarray + 5*ridx[i];
1305       y = yarray + 5*ridx[i];
1306     }
1307     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1308     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1309     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1310     for (j=0; j<n; j++) {
1311       xb    = x + 5*(*idx++);
1312       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1313       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1314       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1315       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1316       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1317       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1318       v    += 25;
1319     }
1320     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1321     if (!usecprow) {
1322       z += 5; y += 5;
1323     }
1324   }
1325   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1326   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1327   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1328   PetscFunctionReturn(0);
1329 }
1330 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1331 {
1332   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1333   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1334   const PetscScalar *x,*xb;
1335   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1336   const MatScalar   *v;
1337   PetscErrorCode    ierr;
1338   PetscInt          mbs = a->mbs,i,j,n;
1339   const PetscInt    *idx,*ii,*ridx=NULL;
1340   PetscBool         usecprow=a->compressedrow.use;
1341 
1342   PetscFunctionBegin;
1343   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1344   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1345 
1346   idx = a->j;
1347   v   = a->a;
1348   if (usecprow) {
1349     if (zz != yy) {
1350       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1351     }
1352     mbs  = a->compressedrow.nrows;
1353     ii   = a->compressedrow.i;
1354     ridx = a->compressedrow.rindex;
1355   } else {
1356     ii = a->i;
1357     y  = yarray;
1358     z  = zarray;
1359   }
1360 
1361   for (i=0; i<mbs; i++) {
1362     n = ii[1] - ii[0]; ii++;
1363     if (usecprow) {
1364       z = zarray + 6*ridx[i];
1365       y = yarray + 6*ridx[i];
1366     }
1367     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1368     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1369     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1370     for (j=0; j<n; j++) {
1371       xb    = x + 6*(*idx++);
1372       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1373       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1374       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1375       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1376       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1377       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1378       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1379       v    += 36;
1380     }
1381     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1382     if (!usecprow) {
1383       z += 6; y += 6;
1384     }
1385   }
1386   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1387   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1388   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1389   PetscFunctionReturn(0);
1390 }
1391 
1392 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1393 {
1394   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1395   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1396   const PetscScalar *x,*xb;
1397   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1398   const MatScalar   *v;
1399   PetscErrorCode    ierr;
1400   PetscInt          mbs = a->mbs,i,j,n;
1401   const PetscInt    *idx,*ii,*ridx = NULL;
1402   PetscBool         usecprow=a->compressedrow.use;
1403 
1404   PetscFunctionBegin;
1405   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1406   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1407 
1408   idx = a->j;
1409   v   = a->a;
1410   if (usecprow) {
1411     if (zz != yy) {
1412       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1413     }
1414     mbs  = a->compressedrow.nrows;
1415     ii   = a->compressedrow.i;
1416     ridx = a->compressedrow.rindex;
1417   } else {
1418     ii = a->i;
1419     y  = yarray;
1420     z  = zarray;
1421   }
1422 
1423   for (i=0; i<mbs; i++) {
1424     n = ii[1] - ii[0]; ii++;
1425     if (usecprow) {
1426       z = zarray + 7*ridx[i];
1427       y = yarray + 7*ridx[i];
1428     }
1429     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1430     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1431     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1432     for (j=0; j<n; j++) {
1433       xb    = x + 7*(*idx++);
1434       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1435       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1436       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1437       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1438       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1439       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1440       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1441       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1442       v    += 49;
1443     }
1444     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1445     if (!usecprow) {
1446       z += 7; y += 7;
1447     }
1448   }
1449   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1450   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1451   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1452   PetscFunctionReturn(0);
1453 }
1454 
1455 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1456 {
1457   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1458   PetscScalar       *z = 0,*work,*workt,*zarray;
1459   const PetscScalar *x,*xb;
1460   const MatScalar   *v;
1461   PetscErrorCode    ierr;
1462   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1463   PetscInt          ncols,k;
1464   const PetscInt    *ridx = NULL,*idx,*ii;
1465   PetscBool         usecprow = a->compressedrow.use;
1466 
1467   PetscFunctionBegin;
1468   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1469   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1470   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1471 
1472   idx = a->j;
1473   v   = a->a;
1474   if (usecprow) {
1475     mbs  = a->compressedrow.nrows;
1476     ii   = a->compressedrow.i;
1477     ridx = a->compressedrow.rindex;
1478   } else {
1479     mbs = a->mbs;
1480     ii  = a->i;
1481     z   = zarray;
1482   }
1483 
1484   if (!a->mult_work) {
1485     k    = PetscMax(A->rmap->n,A->cmap->n);
1486     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1487   }
1488   work = a->mult_work;
1489   for (i=0; i<mbs; i++) {
1490     n     = ii[1] - ii[0]; ii++;
1491     ncols = n*bs;
1492     workt = work;
1493     for (j=0; j<n; j++) {
1494       xb = x + bs*(*idx++);
1495       for (k=0; k<bs; k++) workt[k] = xb[k];
1496       workt += bs;
1497     }
1498     if (usecprow) z = zarray + bs*ridx[i];
1499     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1500     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1501     v += n*bs2;
1502     if (!usecprow) z += bs;
1503   }
1504   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1505   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1506   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1507   PetscFunctionReturn(0);
1508 }
1509 
1510 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1511 {
1512   PetscScalar    zero = 0.0;
1513   PetscErrorCode ierr;
1514 
1515   PetscFunctionBegin;
1516   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1517   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1518   PetscFunctionReturn(0);
1519 }
1520 
1521 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1522 {
1523   PetscScalar    zero = 0.0;
1524   PetscErrorCode ierr;
1525 
1526   PetscFunctionBegin;
1527   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1528   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1529   PetscFunctionReturn(0);
1530 }
1531 
1532 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1533 {
1534   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1535   PetscScalar       *z,x1,x2,x3,x4,x5;
1536   const PetscScalar *x,*xb = NULL;
1537   const MatScalar   *v;
1538   PetscErrorCode    ierr;
1539   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1540   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1541   Mat_CompressedRow cprow = a->compressedrow;
1542   PetscBool         usecprow = cprow.use;
1543 
1544   PetscFunctionBegin;
1545   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1546   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1547   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1548 
1549   idx = a->j;
1550   v   = a->a;
1551   if (usecprow) {
1552     mbs  = cprow.nrows;
1553     ii   = cprow.i;
1554     ridx = cprow.rindex;
1555   } else {
1556     mbs=a->mbs;
1557     ii = a->i;
1558     xb = x;
1559   }
1560 
1561   switch (bs) {
1562   case 1:
1563     for (i=0; i<mbs; i++) {
1564       if (usecprow) xb = x + ridx[i];
1565       x1 = xb[0];
1566       ib = idx + ii[0];
1567       n  = ii[1] - ii[0]; ii++;
1568       for (j=0; j<n; j++) {
1569         rval     = ib[j];
1570         z[rval] += PetscConj(*v) * x1;
1571         v++;
1572       }
1573       if (!usecprow) xb++;
1574     }
1575     break;
1576   case 2:
1577     for (i=0; i<mbs; i++) {
1578       if (usecprow) xb = x + 2*ridx[i];
1579       x1 = xb[0]; x2 = xb[1];
1580       ib = idx + ii[0];
1581       n  = ii[1] - ii[0]; ii++;
1582       for (j=0; j<n; j++) {
1583         rval       = ib[j]*2;
1584         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1585         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1586         v         += 4;
1587       }
1588       if (!usecprow) xb += 2;
1589     }
1590     break;
1591   case 3:
1592     for (i=0; i<mbs; i++) {
1593       if (usecprow) xb = x + 3*ridx[i];
1594       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1595       ib = idx + ii[0];
1596       n  = ii[1] - ii[0]; ii++;
1597       for (j=0; j<n; j++) {
1598         rval       = ib[j]*3;
1599         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1600         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1601         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1602         v         += 9;
1603       }
1604       if (!usecprow) xb += 3;
1605     }
1606     break;
1607   case 4:
1608     for (i=0; i<mbs; i++) {
1609       if (usecprow) xb = x + 4*ridx[i];
1610       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1611       ib = idx + ii[0];
1612       n  = ii[1] - ii[0]; ii++;
1613       for (j=0; j<n; j++) {
1614         rval       = ib[j]*4;
1615         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1616         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1617         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1618         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1619         v         += 16;
1620       }
1621       if (!usecprow) xb += 4;
1622     }
1623     break;
1624   case 5:
1625     for (i=0; i<mbs; i++) {
1626       if (usecprow) xb = x + 5*ridx[i];
1627       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1628       x4 = xb[3]; x5 = xb[4];
1629       ib = idx + ii[0];
1630       n  = ii[1] - ii[0]; ii++;
1631       for (j=0; j<n; j++) {
1632         rval       = ib[j]*5;
1633         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1634         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1635         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1636         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1637         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1638         v         += 25;
1639       }
1640       if (!usecprow) xb += 5;
1641     }
1642     break;
1643   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1644     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1645 #if 0
1646     {
1647       PetscInt          ncols,k,bs2=a->bs2;
1648       PetscScalar       *work,*workt,zb;
1649       const PetscScalar *xtmp;
1650       if (!a->mult_work) {
1651         k    = PetscMax(A->rmap->n,A->cmap->n);
1652         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1653       }
1654       work = a->mult_work;
1655       xtmp = x;
1656       for (i=0; i<mbs; i++) {
1657         n     = ii[1] - ii[0]; ii++;
1658         ncols = n*bs;
1659         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1660         if (usecprow) xtmp = x + bs*ridx[i];
1661         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1662         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1663         v += n*bs2;
1664         if (!usecprow) xtmp += bs;
1665         workt = work;
1666         for (j=0; j<n; j++) {
1667           zb = z + bs*(*idx++);
1668           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1669           workt += bs;
1670         }
1671       }
1672     }
1673 #endif
1674   }
1675   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1676   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1677   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1678   PetscFunctionReturn(0);
1679 }
1680 
1681 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1682 {
1683   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1684   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
1685   const PetscScalar *x,*xb = 0;
1686   const MatScalar   *v;
1687   PetscErrorCode    ierr;
1688   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
1689   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1690   Mat_CompressedRow cprow   = a->compressedrow;
1691   PetscBool         usecprow=cprow.use;
1692 
1693   PetscFunctionBegin;
1694   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1695   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1696   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1697 
1698   idx = a->j;
1699   v   = a->a;
1700   if (usecprow) {
1701     mbs  = cprow.nrows;
1702     ii   = cprow.i;
1703     ridx = cprow.rindex;
1704   } else {
1705     mbs=a->mbs;
1706     ii = a->i;
1707     xb = x;
1708   }
1709 
1710   switch (bs) {
1711   case 1:
1712     for (i=0; i<mbs; i++) {
1713       if (usecprow) xb = x + ridx[i];
1714       x1 = xb[0];
1715       ib = idx + ii[0];
1716       n  = ii[1] - ii[0]; ii++;
1717       for (j=0; j<n; j++) {
1718         rval     = ib[j];
1719         z[rval] += *v * x1;
1720         v++;
1721       }
1722       if (!usecprow) xb++;
1723     }
1724     break;
1725   case 2:
1726     for (i=0; i<mbs; i++) {
1727       if (usecprow) xb = x + 2*ridx[i];
1728       x1 = xb[0]; x2 = xb[1];
1729       ib = idx + ii[0];
1730       n  = ii[1] - ii[0]; ii++;
1731       for (j=0; j<n; j++) {
1732         rval       = ib[j]*2;
1733         z[rval++] += v[0]*x1 + v[1]*x2;
1734         z[rval++] += v[2]*x1 + v[3]*x2;
1735         v         += 4;
1736       }
1737       if (!usecprow) xb += 2;
1738     }
1739     break;
1740   case 3:
1741     for (i=0; i<mbs; i++) {
1742       if (usecprow) xb = x + 3*ridx[i];
1743       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1744       ib = idx + ii[0];
1745       n  = ii[1] - ii[0]; ii++;
1746       for (j=0; j<n; j++) {
1747         rval       = ib[j]*3;
1748         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1749         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1750         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1751         v         += 9;
1752       }
1753       if (!usecprow) xb += 3;
1754     }
1755     break;
1756   case 4:
1757     for (i=0; i<mbs; i++) {
1758       if (usecprow) xb = x + 4*ridx[i];
1759       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1760       ib = idx + ii[0];
1761       n  = ii[1] - ii[0]; ii++;
1762       for (j=0; j<n; j++) {
1763         rval       = ib[j]*4;
1764         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1765         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1766         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1767         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1768         v         += 16;
1769       }
1770       if (!usecprow) xb += 4;
1771     }
1772     break;
1773   case 5:
1774     for (i=0; i<mbs; i++) {
1775       if (usecprow) xb = x + 5*ridx[i];
1776       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1777       x4 = xb[3]; x5 = xb[4];
1778       ib = idx + ii[0];
1779       n  = ii[1] - ii[0]; ii++;
1780       for (j=0; j<n; j++) {
1781         rval       = ib[j]*5;
1782         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1783         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1784         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1785         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1786         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1787         v         += 25;
1788       }
1789       if (!usecprow) xb += 5;
1790     }
1791     break;
1792   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1793     PetscInt          ncols,k;
1794     PetscScalar       *work,*workt;
1795     const PetscScalar *xtmp;
1796     if (!a->mult_work) {
1797       k    = PetscMax(A->rmap->n,A->cmap->n);
1798       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1799     }
1800     work = a->mult_work;
1801     xtmp = x;
1802     for (i=0; i<mbs; i++) {
1803       n     = ii[1] - ii[0]; ii++;
1804       ncols = n*bs;
1805       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1806       if (usecprow) xtmp = x + bs*ridx[i];
1807       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1808       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1809       v += n*bs2;
1810       if (!usecprow) xtmp += bs;
1811       workt = work;
1812       for (j=0; j<n; j++) {
1813         zb = z + bs*(*idx++);
1814         for (k=0; k<bs; k++) zb[k] += workt[k];
1815         workt += bs;
1816       }
1817     }
1818     }
1819   }
1820   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1821   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1822   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1823   PetscFunctionReturn(0);
1824 }
1825 
1826 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1827 {
1828   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
1829   PetscInt       totalnz = a->bs2*a->nz;
1830   PetscScalar    oalpha  = alpha;
1831   PetscErrorCode ierr;
1832   PetscBLASInt   one = 1,tnz;
1833 
1834   PetscFunctionBegin;
1835   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
1836   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
1837   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1838   PetscFunctionReturn(0);
1839 }
1840 
1841 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1842 {
1843   PetscErrorCode ierr;
1844   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
1845   MatScalar      *v  = a->a;
1846   PetscReal      sum = 0.0;
1847   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1848 
1849   PetscFunctionBegin;
1850   if (type == NORM_FROBENIUS) {
1851 #if defined(PETSC_USE_REAL___FP16)
1852     PetscBLASInt one = 1,cnt = bs2*nz;
1853     *norm = BLASnrm2_(&cnt,v,&one);
1854 #else
1855     for (i=0; i<bs2*nz; i++) {
1856       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1857     }
1858 #endif
1859     *norm = PetscSqrtReal(sum);
1860     ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr);
1861   } else if (type == NORM_1) { /* maximum column sum */
1862     PetscReal *tmp;
1863     PetscInt  *bcol = a->j;
1864     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
1865     for (i=0; i<nz; i++) {
1866       for (j=0; j<bs; j++) {
1867         k1 = bs*(*bcol) + j; /* column index */
1868         for (k=0; k<bs; k++) {
1869           tmp[k1] += PetscAbsScalar(*v); v++;
1870         }
1871       }
1872       bcol++;
1873     }
1874     *norm = 0.0;
1875     for (j=0; j<A->cmap->n; j++) {
1876       if (tmp[j] > *norm) *norm = tmp[j];
1877     }
1878     ierr = PetscFree(tmp);CHKERRQ(ierr);
1879     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
1880   } else if (type == NORM_INFINITY) { /* maximum row sum */
1881     *norm = 0.0;
1882     for (k=0; k<bs; k++) {
1883       for (j=0; j<a->mbs; j++) {
1884         v   = a->a + bs2*a->i[j] + k;
1885         sum = 0.0;
1886         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1887           for (k1=0; k1<bs; k1++) {
1888             sum += PetscAbsScalar(*v);
1889             v   += bs;
1890           }
1891         }
1892         if (sum > *norm) *norm = sum;
1893       }
1894     }
1895     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
1896   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1897   PetscFunctionReturn(0);
1898 }
1899 
1900 
1901 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1902 {
1903   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
1904   PetscErrorCode ierr;
1905 
1906   PetscFunctionBegin;
1907   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1908   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1909     *flg = PETSC_FALSE;
1910     PetscFunctionReturn(0);
1911   }
1912 
1913   /* if the a->i are the same */
1914   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1915   if (!*flg) PetscFunctionReturn(0);
1916 
1917   /* if a->j are the same */
1918   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1919   if (!*flg) PetscFunctionReturn(0);
1920 
1921   /* if a->a are the same */
1922   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1923   PetscFunctionReturn(0);
1924 
1925 }
1926 
1927 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1928 {
1929   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1930   PetscErrorCode ierr;
1931   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1932   PetscScalar    *x,zero = 0.0;
1933   MatScalar      *aa,*aa_j;
1934 
1935   PetscFunctionBegin;
1936   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1937   bs   = A->rmap->bs;
1938   aa   = a->a;
1939   ai   = a->i;
1940   aj   = a->j;
1941   ambs = a->mbs;
1942   bs2  = a->bs2;
1943 
1944   ierr = VecSet(v,zero);CHKERRQ(ierr);
1945   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1946   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1947   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1948   for (i=0; i<ambs; i++) {
1949     for (j=ai[i]; j<ai[i+1]; j++) {
1950       if (aj[j] == i) {
1951         row  = i*bs;
1952         aa_j = aa+j*bs2;
1953         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1954         break;
1955       }
1956     }
1957   }
1958   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1959   PetscFunctionReturn(0);
1960 }
1961 
1962 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1963 {
1964   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1965   const PetscScalar *l,*r,*li,*ri;
1966   PetscScalar       x;
1967   MatScalar         *aa, *v;
1968   PetscErrorCode    ierr;
1969   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
1970   const PetscInt    *ai,*aj;
1971 
1972   PetscFunctionBegin;
1973   ai  = a->i;
1974   aj  = a->j;
1975   aa  = a->a;
1976   m   = A->rmap->n;
1977   n   = A->cmap->n;
1978   bs  = A->rmap->bs;
1979   mbs = a->mbs;
1980   bs2 = a->bs2;
1981   if (ll) {
1982     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1983     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1984     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1985     for (i=0; i<mbs; i++) { /* for each block row */
1986       M  = ai[i+1] - ai[i];
1987       li = l + i*bs;
1988       v  = aa + bs2*ai[i];
1989       for (j=0; j<M; j++) { /* for each block */
1990         for (k=0; k<bs2; k++) {
1991           (*v++) *= li[k%bs];
1992         }
1993       }
1994     }
1995     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1996     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1997   }
1998 
1999   if (rr) {
2000     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2001     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2002     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2003     for (i=0; i<mbs; i++) { /* for each block row */
2004       iai = ai[i];
2005       M   = ai[i+1] - iai;
2006       v   = aa + bs2*iai;
2007       for (j=0; j<M; j++) { /* for each block */
2008         ri = r + bs*aj[iai+j];
2009         for (k=0; k<bs; k++) {
2010           x = ri[k];
2011           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2012           v += bs;
2013         }
2014       }
2015     }
2016     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2017     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2018   }
2019   PetscFunctionReturn(0);
2020 }
2021 
2022 
2023 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2024 {
2025   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2026 
2027   PetscFunctionBegin;
2028   info->block_size   = a->bs2;
2029   info->nz_allocated = a->bs2*a->maxnz;
2030   info->nz_used      = a->bs2*a->nz;
2031   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2032   info->assemblies   = A->num_ass;
2033   info->mallocs      = A->info.mallocs;
2034   info->memory       = ((PetscObject)A)->mem;
2035   if (A->factortype) {
2036     info->fill_ratio_given  = A->info.fill_ratio_given;
2037     info->fill_ratio_needed = A->info.fill_ratio_needed;
2038     info->factor_mallocs    = A->info.factor_mallocs;
2039   } else {
2040     info->fill_ratio_given  = 0;
2041     info->fill_ratio_needed = 0;
2042     info->factor_mallocs    = 0;
2043   }
2044   PetscFunctionReturn(0);
2045 }
2046 
2047 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2048 {
2049   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2050   PetscErrorCode ierr;
2051 
2052   PetscFunctionBegin;
2053   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2054   PetscFunctionReturn(0);
2055 }
2056