xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision cdc6f3ad59c428a9501804ba345252ac3fce004b)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc-private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 #if defined(PETSC_THREADCOMM_ACTIVE)
7 #include <petscthreadcomm.h>
8 #endif
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13 {
14   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
15   PetscErrorCode ierr;
16   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
17   const PetscInt *idx;
18   PetscInt       start,end,*ai,*aj,bs,*nidx2;
19   PetscBT        table;
20 
21   PetscFunctionBegin;
22   m  = a->mbs;
23   ai = a->i;
24   aj = a->j;
25   bs = A->rmap->bs;
26 
27   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28 
29   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
30   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
31   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
32 
33   for (i=0; i<is_max; i++) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for (j=0; j<n; ++j) {
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++) { /* for each overlap*/
53       n = isz;
54       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for (l = start; l<end; l++) {
59           val = aj[l];
60           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++) {
66       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67     }
68     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
69   }
70   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
71   ierr = PetscFree(nidx);CHKERRQ(ierr);
72   ierr = PetscFree(nidx2);CHKERRQ(ierr);
73   PetscFunctionReturn(0);
74 }
75 
76 #undef __FUNCT__
77 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
78 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
79 {
80   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
81   PetscErrorCode ierr;
82   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
83   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
84   const PetscInt *irow,*icol;
85   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
86   PetscInt       *aj = a->j,*ai = a->i;
87   MatScalar      *mat_a;
88   Mat            C;
89   PetscBool      flag,sorted;
90 
91   PetscFunctionBegin;
92   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
93   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
94 
95   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
96   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
97   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
98   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
99 
100   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
101   ssmap = smap;
102   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
103   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
104   /* determine lens of each row */
105   for (i=0; i<nrows; i++) {
106     kstart  = ai[irow[i]];
107     kend    = kstart + a->ilen[irow[i]];
108     lens[i] = 0;
109     for (k=kstart; k<kend; k++) {
110       if (ssmap[aj[k]]) lens[i]++;
111     }
112   }
113   /* Create and fill new matrix */
114   if (scall == MAT_REUSE_MATRIX) {
115     c = (Mat_SeqBAIJ*)((*B)->data);
116 
117     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
118     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
119     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
120     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
121     C    = *B;
122   } else {
123     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
124     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
125     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
126     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
127   }
128   c = (Mat_SeqBAIJ*)(C->data);
129   for (i=0; i<nrows; i++) {
130     row      = irow[i];
131     kstart   = ai[row];
132     kend     = kstart + a->ilen[row];
133     mat_i    = c->i[i];
134     mat_j    = c->j + mat_i;
135     mat_a    = c->a + mat_i*bs2;
136     mat_ilen = c->ilen + i;
137     for (k=kstart; k<kend; k++) {
138       if ((tcol=ssmap[a->j[k]])) {
139         *mat_j++ = tcol - 1;
140         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
141         mat_a   += bs2;
142         (*mat_ilen)++;
143       }
144     }
145   }
146   /* sort */
147   {
148     MatScalar *work;
149 
150     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
151     for (i=0; i<nrows; i++) {
152       PetscInt ilen;
153       mat_i = c->i[i];
154       mat_j = c->j + mat_i;
155       mat_a = c->a + mat_i*bs2;
156       ilen  = c->ilen[i];
157       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
158     }
159     ierr = PetscFree(work);CHKERRQ(ierr);
160   }
161 
162   /* Free work space */
163   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
164   ierr = PetscFree(smap);CHKERRQ(ierr);
165   ierr = PetscFree(lens);CHKERRQ(ierr);
166   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
167   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
168 
169   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
170   *B   = C;
171   PetscFunctionReturn(0);
172 }
173 
174 #undef __FUNCT__
175 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
176 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
177 {
178   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
179   IS             is1,is2;
180   PetscErrorCode ierr;
181   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
182   const PetscInt *irow,*icol;
183 
184   PetscFunctionBegin;
185   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
186   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
187   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
188   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
189 
190   /* Verify if the indices corespond to each element in a block
191    and form the IS with compressed IS */
192   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
193   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
194   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
195   count = 0;
196   for (i=0; i<a->mbs; i++) {
197     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
198     if (vary[i]==bs) iary[count++] = i;
199   }
200   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
201 
202   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
203   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
204   count = 0;
205   for (i=0; i<a->mbs; i++) {
206     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
207     if (vary[i]==bs) iary[count++] = i;
208   }
209   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
210   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
211   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
212   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
213 
214   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
215   ierr = ISDestroy(&is1);CHKERRQ(ierr);
216   ierr = ISDestroy(&is2);CHKERRQ(ierr);
217   PetscFunctionReturn(0);
218 }
219 
220 #undef __FUNCT__
221 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
222 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
223 {
224   PetscErrorCode ierr;
225   PetscInt       i;
226 
227   PetscFunctionBegin;
228   if (scall == MAT_INITIAL_MATRIX) {
229     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
230   }
231 
232   for (i=0; i<n; i++) {
233     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
234   }
235   PetscFunctionReturn(0);
236 }
237 
238 
239 /* -------------------------------------------------------*/
240 /* Should check that shapes of vectors and matrices match */
241 /* -------------------------------------------------------*/
242 
243 #if defined(PETSC_THREADCOMM_ACTIVE)
244 PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
245 {
246   PetscErrorCode    ierr;
247   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
248   PetscScalar       *z;
249   const PetscScalar *x;
250   const MatScalar   *aa;
251   PetscInt          *trstarts=A->rmap->trstarts;
252   PetscInt          n,start,end,i;
253   const PetscInt    *aj,*ai;
254   PetscScalar       sum;
255 
256   ierr  = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
257   ierr  = VecGetArray(zz,&z);CHKERRQ(ierr);
258   start = trstarts[thread_id];
259   end   = trstarts[thread_id+1];
260   ai    = a->i;
261   for (i=start; i<end; i++) {
262     n   = ai[i+1] - ai[i];
263     aj  = a->j + ai[i];
264     aa  = a->a + ai[i];
265     sum = 0.0;
266     PetscSparseDensePlusDot(sum,x,aa,aj,n);
267     z[i] = sum;
268   }
269   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
270   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
271   return 0;
272 }
273 
274 #undef __FUNCT__
275 #define __FUNCT__ "MatMult_SeqBAIJ_1"
276 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
277 {
278   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
279   PetscScalar       *z,sum;
280   const PetscScalar *x;
281   const MatScalar   *v;
282   PetscErrorCode    ierr;
283   PetscInt          mbs,i,n;
284   const PetscInt    *idx,*ii,*ridx=NULL;
285   PetscBool         usecprow=a->compressedrow.use;
286 
287   PetscFunctionBegin;
288 
289   if (usecprow) {
290     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
291     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
292     mbs  = a->compressedrow.nrows;
293     ii   = a->compressedrow.i;
294     ridx = a->compressedrow.rindex;
295     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
296     for (i=0; i<mbs; i++) {
297       n   = ii[i+1] - ii[i];
298       v   = a->a + ii[i];
299       idx = a->j + ii[i];
300       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
301       PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
302       sum = 0.0;
303       PetscSparseDensePlusDot(sum,x,v,idx,n);
304       z[ridx[i]] = sum;
305     }
306     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
307     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
308   } else {
309     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);CHKERRQ(ierr);
310   }
311   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
312   PetscFunctionReturn(0);
313 }
314 #else
315 #undef __FUNCT__
316 #define __FUNCT__ "MatMult_SeqBAIJ_1"
317 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
318 {
319   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
320   PetscScalar       *z,sum;
321   const PetscScalar *x;
322   const MatScalar   *v;
323   PetscErrorCode    ierr;
324   PetscInt          mbs,i,n;
325   const PetscInt    *idx,*ii,*ridx=NULL;
326   PetscBool         usecprow=a->compressedrow.use;
327 
328   PetscFunctionBegin;
329   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
330   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
331 
332   if (usecprow) {
333     mbs  = a->compressedrow.nrows;
334     ii   = a->compressedrow.i;
335     ridx = a->compressedrow.rindex;
336     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
337   } else {
338     mbs = a->mbs;
339     ii  = a->i;
340   }
341 
342   for (i=0; i<mbs; i++) {
343     n   = ii[1] - ii[0];
344     v   = a->a + ii[0];
345     idx = a->j + ii[0];
346     ii++;
347     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
348     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
349     sum = 0.0;
350     PetscSparseDensePlusDot(sum,x,v,idx,n);
351     if (usecprow) {
352       z[ridx[i]] = sum;
353     } else {
354       z[i]        = sum;
355     }
356   }
357   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
358   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
359   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
360   PetscFunctionReturn(0);
361 }
362 #endif
363 
364 #if defined(PETSC_THREADCOMM_ACTIVE)
365 PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
366 {
367   PetscErrorCode    ierr;
368   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
369   PetscScalar       *z,x1,x2,sum1,sum2;
370   const PetscScalar *x,*xb;
371   const MatScalar   *aa;
372   PetscInt          *trstarts=A->rmap->trstarts;
373   PetscInt          n,start,end,i,j;
374   const PetscInt    *aj,*ai;
375 
376   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
377   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
378   start  = trstarts[thread_id] / 2;
379   end    = trstarts[thread_id+1] / 2;
380   ai     = a->i;
381   for (i=start; i<end; i++) {
382     n    = ai[i+1] - ai[i];
383     aj   = a->j + ai[i];
384     aa   = a->a + ai[i]*4;
385     sum1 = 0.0; sum2 = 0.0;
386     for (j=0; j<n; j++) {
387       xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1];
388       sum1 += aa[4*j]*x1   + aa[4*j+2]*x2;
389       sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2;
390     }
391     z[2*i] = sum1; z[2*i+1] = sum2;
392   }
393   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
394   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
395   return 0;
396 }
397 
398 #undef __FUNCT__
399 #define __FUNCT__ "MatMult_SeqBAIJ_2"
400 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
401 {
402   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
403   PetscScalar       *z,x1,x2,sum1,sum2;
404   const PetscScalar *x,*xb;
405   const MatScalar   *v;
406   PetscErrorCode    ierr;
407   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
408   PetscBool         usecprow=a->compressedrow.use;
409 
410   PetscFunctionBegin;
411 
412   if (usecprow) {
413     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
414     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
415     mbs  = a->compressedrow.nrows;
416     ii   = a->compressedrow.i;
417     ridx = a->compressedrow.rindex;
418     for (i=0; i<mbs; i++) {
419       n    = ii[i+1] - ii[i];
420       idx  = a->j + ii[i];
421       v    = a->a + ii[i]*4;
422       sum1 = 0.0; sum2 = 0.0;
423       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
424       PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
425       for (j=0; j<n; j++) {
426 	xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1];
427 	sum1 += v[4*j]*x1   + v[4*j+2]*x2;
428 	sum2 += v[4*j+1]*x1 + v[4*j+3]*x2;
429       }
430       z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2;
431     }
432     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
433     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
434   } else {
435     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);CHKERRQ(ierr);
436   }
437   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
438   PetscFunctionReturn(0);
439 }
440 #else
441 #undef __FUNCT__
442 #define __FUNCT__ "MatMult_SeqBAIJ_2"
443 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
444 {
445   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
446   PetscScalar       *z = 0,sum1,sum2,*zarray;
447   const PetscScalar *x,*xb;
448   PetscScalar       x1,x2;
449   const MatScalar   *v;
450   PetscErrorCode    ierr;
451   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
452   PetscBool         usecprow=a->compressedrow.use;
453 
454   PetscFunctionBegin;
455   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
456   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
457 
458   idx = a->j;
459   v   = a->a;
460   if (usecprow) {
461     mbs  = a->compressedrow.nrows;
462     ii   = a->compressedrow.i;
463     ridx = a->compressedrow.rindex;
464   } else {
465     mbs = a->mbs;
466     ii  = a->i;
467     z   = zarray;
468   }
469 
470   for (i=0; i<mbs; i++) {
471     n           = ii[1] - ii[0]; ii++;
472     sum1        = 0.0; sum2 = 0.0;
473     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
474     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
475     for (j=0; j<n; j++) {
476       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
477       sum1 += v[0]*x1 + v[2]*x2;
478       sum2 += v[1]*x1 + v[3]*x2;
479       v    += 4;
480     }
481     if (usecprow) z = zarray + 2*ridx[i];
482     z[0] = sum1; z[1] = sum2;
483     if (!usecprow) z += 2;
484   }
485   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
486   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
487   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
488   PetscFunctionReturn(0);
489 }
490 #endif
491 
492 #if defined(PETSC_THREADCOMM_ACTIVE)
493 PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
494 {
495   PetscErrorCode    ierr;
496   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
497   PetscScalar       *z,x1,x2,x3,sum1,sum2,sum3;
498   const PetscScalar *x,*xb;
499   const MatScalar   *aa;
500   PetscInt          *trstarts=A->rmap->trstarts;
501   PetscInt          n,start,end,i,j;
502   const PetscInt    *aj,*ai;
503 
504   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
505   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
506   start  = trstarts[thread_id] / 3;
507   end    = trstarts[thread_id+1] / 3;
508   ai     = a->i;
509   for (i=start; i<end; i++) {
510     n    = ai[i+1] - ai[i];
511     aj   = a->j + ai[i];
512     aa   = a->a + ai[i]*9;
513     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
514     for (j=0; j<n; j++) {
515       xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
516       sum1 += aa[9*j]*x1   + aa[9*j+3]*x2 + aa[9*j+6]*x3;
517       sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3;
518       sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3;
519     }
520     z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3;
521   }
522   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
523   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
524   return 0;
525 }
526 
527 #undef __FUNCT__
528 #define __FUNCT__ "MatMult_SeqBAIJ_3"
529 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
530 {
531   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
532   PetscScalar       *z,sum1,sum2,sum3,x1,x2,x3;
533   const PetscScalar *x,*xb;
534   const MatScalar   *v;
535   PetscErrorCode    ierr;
536   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
537   PetscBool         usecprow=a->compressedrow.use;
538 
539 
540 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
541 #pragma disjoint(*v,*z,*xb)
542 #endif
543 
544   PetscFunctionBegin;
545 
546   if (usecprow) {
547     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
548     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
549     mbs  = a->compressedrow.nrows;
550     ii   = a->compressedrow.i;
551     ridx = a->compressedrow.rindex;
552     for (i=0; i<mbs; i++) {
553       n    = ii[i+1] - ii[i];
554       idx  = a->j + ii[i];
555       v    = a->a + ii[i]*9;
556       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
557       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
558       PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
559       for (j=0; j<n; j++) {
560 	xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
561 	sum1 += v[9*j]*x1   + v[9*j+3]*x2 + v[9*j+6]*x3;
562 	sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3;
563 	sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3;
564       }
565       z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3;
566     }
567     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
568     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
569   } else {
570     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);CHKERRQ(ierr);
571   }
572   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
573   PetscFunctionReturn(0);
574 }
575 #else
576 #undef __FUNCT__
577 #define __FUNCT__ "MatMult_SeqBAIJ_3"
578 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
579 {
580   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
581   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
582   const PetscScalar *x,*xb;
583   const MatScalar   *v;
584   PetscErrorCode    ierr;
585   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
586   PetscBool         usecprow=a->compressedrow.use;
587 
588 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
589 #pragma disjoint(*v,*z,*xb)
590 #endif
591 
592   PetscFunctionBegin;
593   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
594   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
595 
596   idx = a->j;
597   v   = a->a;
598   if (usecprow) {
599     mbs  = a->compressedrow.nrows;
600     ii   = a->compressedrow.i;
601     ridx = a->compressedrow.rindex;
602   } else {
603     mbs = a->mbs;
604     ii  = a->i;
605     z   = zarray;
606   }
607 
608   for (i=0; i<mbs; i++) {
609     n           = ii[1] - ii[0]; ii++;
610     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
611     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
612     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
613     for (j=0; j<n; j++) {
614       xb = x + 3*(*idx++);
615       x1 = xb[0];
616       x2 = xb[1];
617       x3 = xb[2];
618 
619       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
620       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
621       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
622       v    += 9;
623     }
624     if (usecprow) z = zarray + 3*ridx[i];
625     z[0] = sum1; z[1] = sum2; z[2] = sum3;
626     if (!usecprow) z += 3;
627   }
628   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
629   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
630   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
631   PetscFunctionReturn(0);
632 }
633 #endif
634 
635 #if defined(PETSC_THREADCOMM_ACTIVE)
636 PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
637 {
638   PetscErrorCode    ierr;
639   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
640   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
641   const PetscScalar *x,*xb;
642   const MatScalar   *aa;
643   PetscInt          *trstarts=A->rmap->trstarts;
644   PetscInt          n,start,end,i,j;
645   const PetscInt    *aj,*ai;
646 
647   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
648   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
649   start  = trstarts[thread_id] / 4;
650   end    = trstarts[thread_id+1] / 4;
651   ai     = a->i;
652   for (i=start; i<end; i++) {
653     n    = ai[i+1] - ai[i];
654     aj   = a->j + ai[i];
655     aa   = a->a + ai[i]*16;
656     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
657     for (j=0; j<n; j++) {
658       xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
659       sum1 += aa[16*j]*x1   + aa[16*j+4]*x2 + aa[16*j+8]*x3  + aa[16*j+12]*x4;
660       sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3  + aa[16*j+13]*x4;
661       sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4;
662       sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4;
663     }
664     z[4*i]   = sum1; z[4*i+1] = sum2;
665     z[4*i+2] = sum3; z[4*i+3] = sum4;
666   }
667   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
668   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
669   return 0;
670 }
671 
672 #undef __FUNCT__
673 #define __FUNCT__ "MatMult_SeqBAIJ_4"
674 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
675 {
676   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
677   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
678   const PetscScalar *x,*xb;
679   const MatScalar   *v;
680   PetscErrorCode    ierr;
681   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
682   PetscBool         usecprow=a->compressedrow.use;
683 
684   PetscFunctionBegin;
685 
686   if (usecprow) {
687     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
688     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
689     mbs  = a->compressedrow.nrows;
690     ii   = a->compressedrow.i;
691     ridx = a->compressedrow.rindex;
692     for (i=0; i<mbs; i++) {
693       n = ii[i+1] - ii[1];
694       idx  = a->j + ii[i];
695       v    = a->a + ii[i]*16;
696       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
697       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
698       PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
699       for (j=0; j<n; j++) {
700 	xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
701 	sum1 += v[16*j]*x1   + v[16*j+4]*x2 + v[16*j+8]*x3  + v[16*j+12]*x4;
702 	sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3  + v[16*j+13]*x4;
703 	sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4;
704 	sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4;
705       }
706       z[4*ridx[i]]   = sum1; z[4*ridx[i]+1] = sum2;
707       z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4;
708     }
709     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
710     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
711   } else {
712     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);CHKERRQ(ierr);
713   }
714   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
715   PetscFunctionReturn(0);
716 }
717 #else
718 #undef __FUNCT__
719 #define __FUNCT__ "MatMult_SeqBAIJ_4"
720 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
721 {
722   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
723   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
724   const PetscScalar *x,*xb;
725   const MatScalar   *v;
726   PetscErrorCode    ierr;
727   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
728   PetscBool         usecprow=a->compressedrow.use;
729 
730   PetscFunctionBegin;
731   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
732   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
733 
734   idx = a->j;
735   v   = a->a;
736   if (usecprow) {
737     mbs  = a->compressedrow.nrows;
738     ii   = a->compressedrow.i;
739     ridx = a->compressedrow.rindex;
740   } else {
741     mbs = a->mbs;
742     ii  = a->i;
743     z   = zarray;
744   }
745 
746   for (i=0; i<mbs; i++) {
747     n = ii[1] - ii[0];
748     ii++;
749     sum1 = 0.0;
750     sum2 = 0.0;
751     sum3 = 0.0;
752     sum4 = 0.0;
753 
754     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
755     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
756     for (j=0; j<n; j++) {
757       xb    = x + 4*(*idx++);
758       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
759       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
760       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
761       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
762       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
763       v    += 16;
764     }
765     if (usecprow) z = zarray + 4*ridx[i];
766     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
767     if (!usecprow) z += 4;
768   }
769   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
771   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
772   PetscFunctionReturn(0);
773 }
774 #endif
775 
776 #if defined(PETSC_THREADCOMM_ACTIVE)
777 PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
778 {
779   PetscErrorCode    ierr;
780   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
781   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
782   const PetscScalar *x,*xb;
783   const MatScalar   *aa;
784   PetscInt          *trstarts=A->rmap->trstarts;
785   PetscInt          n,start,end,i,j;
786   const PetscInt    *aj,*ai;
787 
788   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
789   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
790   start  = trstarts[thread_id] / 5;
791   end    = trstarts[thread_id+1] / 5;
792   ai     = a->i;
793   for (i=start; i<end; i++) {
794     n    = ai[i+1] - ai[i];
795     aj   = a->j + ai[i];
796     aa   = a->a + ai[i]*25;
797     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
798     for (j=0; j<n; j++) {
799       xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
800       sum1 += aa[25*j]*x1   + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5;
801       sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5;
802       sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5;
803       sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5;
804       sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5;
805     }
806     z[5*i]   = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3;
807     z[5*i+3] = sum4; z[5*i+4] = sum5;
808   }
809   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
810   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
811   return 0;
812 }
813 
814 #undef __FUNCT__
815 #define __FUNCT__ "MatMult_SeqBAIJ_5"
816 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
817 {
818   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
819   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
820   const PetscScalar *xb,*x;
821   const MatScalar   *v;
822   PetscErrorCode    ierr;
823   const PetscInt    *idx,*ii,*ridx=NULL;
824   PetscInt          mbs,i,j,n;
825   PetscBool         usecprow=a->compressedrow.use;
826 
827   PetscFunctionBegin;
828 
829   if (usecprow) {
830     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
831     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
832     mbs  = a->compressedrow.nrows;
833     ii   = a->compressedrow.i;
834     ridx = a->compressedrow.rindex;
835     for (i=0; i<mbs; i++) {
836       n    = ii[i+1] - ii[i];
837       idx  = a->j + ii[i];
838       v    = a->a + ii[i]*25;
839       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
840       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
841       PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
842       for (j=0; j<n; j++) {
843 	xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
844 	sum1 += v[25*j]*x1   + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5;
845 	sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5;
846 	sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5;
847 	sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5;
848 	sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5;
849       }
850       z[5*ridx[i]]   = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3;
851       z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5;
852     }
853     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
854     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
855   } else {
856     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);CHKERRQ(ierr);
857   }
858   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
859   PetscFunctionReturn(0);
860 }
861 #else
862 #undef __FUNCT__
863 #define __FUNCT__ "MatMult_SeqBAIJ_5"
864 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
865 {
866   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
867   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
868   const PetscScalar *xb,*x;
869   const MatScalar   *v;
870   PetscErrorCode    ierr;
871   const PetscInt    *idx,*ii,*ridx=NULL;
872   PetscInt          mbs,i,j,n;
873   PetscBool         usecprow=a->compressedrow.use;
874 
875   PetscFunctionBegin;
876   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
877   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
878 
879   idx = a->j;
880   v   = a->a;
881   if (usecprow) {
882     mbs  = a->compressedrow.nrows;
883     ii   = a->compressedrow.i;
884     ridx = a->compressedrow.rindex;
885   } else {
886     mbs = a->mbs;
887     ii  = a->i;
888     z   = zarray;
889   }
890 
891   for (i=0; i<mbs; i++) {
892     n           = ii[1] - ii[0]; ii++;
893     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
894     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
895     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
896     for (j=0; j<n; j++) {
897       xb    = x + 5*(*idx++);
898       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
899       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
900       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
901       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
902       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
903       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
904       v    += 25;
905     }
906     if (usecprow) z = zarray + 5*ridx[i];
907     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
908     if (!usecprow) z += 5;
909   }
910   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
911   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
912   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
913   PetscFunctionReturn(0);
914 }
915 #endif
916 
917 
918 #if defined(PETSC_THREADCOMM_ACTIVE)
919 PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
920 {
921   PetscErrorCode    ierr;
922   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
923   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
924   const PetscScalar *x,*xb;
925   const MatScalar   *aa;
926   PetscInt          *trstarts=A->rmap->trstarts;
927   PetscInt          n,start,end,i,j;
928   const PetscInt    *aj,*ai;
929 
930   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
931   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
932   start  = trstarts[thread_id] / 6;
933   end    = trstarts[thread_id+1] / 6;
934   ai     = a->i;
935   for (i=start; i<end; i++) {
936     n    = ai[i+1] - ai[i];
937     aj   = a->j + ai[i];
938     aa   = a->a + ai[i]*36;
939     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
940     for (j=0; j<n; j++) {
941       xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
942       sum1 += aa[36*j]*x1   + aa[36*j+6]*x2  + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6;
943       sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2  + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6;
944       sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2  + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6;
945       sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2  + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6;
946       sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6;
947       sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6;
948     }
949     z[6*i]   = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3;
950     z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6;
951   }
952   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
953   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
954   return 0;
955 }
956 
957 #undef __FUNCT__
958 #define __FUNCT__ "MatMult_SeqBAIJ_6"
959 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
960 {
961   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
962   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
963   const PetscScalar *x,*xb;
964   const MatScalar   *v;
965   PetscErrorCode    ierr;
966   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
967   PetscBool         usecprow=a->compressedrow.use;
968 
969   PetscFunctionBegin;
970 
971   if (usecprow) {
972     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
973     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
974     mbs  = a->compressedrow.nrows;
975     ii   = a->compressedrow.i;
976     ridx = a->compressedrow.rindex;
977     for (i=0; i<mbs; i++) {
978       n  = ii[i+1] - ii[i];
979       idx  = a->j + ii[i];
980       v    = a->a + ii[i]*36;
981       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
982       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
983       PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
984       for (j=0; j<n; j++) {
985 	xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
986 	sum1 += v[36*j]*x1   + v[36*j+6]*x2  + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6;
987 	sum2 += v[36*j+1]*x1 + v[36*j+7]*x2  + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6;
988 	sum3 += v[36*j+2]*x1 + v[36*j+8]*x2  + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6;
989 	sum4 += v[36*j+3]*x1 + v[36*j+9]*x2  + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6;
990 	sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6;
991 	sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6;
992       }
993       z[6*ridx[i]]   = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3;
994       z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6;
995     }
996     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
997     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
998   } else {
999     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);CHKERRQ(ierr);
1000   }
1001   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
1002   PetscFunctionReturn(0);
1003 }
1004 #else
1005 #undef __FUNCT__
1006 #define __FUNCT__ "MatMult_SeqBAIJ_6"
1007 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
1008 {
1009   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1010   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1011   const PetscScalar *x,*xb;
1012   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
1013   const MatScalar   *v;
1014   PetscErrorCode    ierr;
1015   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1016   PetscBool         usecprow=a->compressedrow.use;
1017 
1018   PetscFunctionBegin;
1019   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1020   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1021 
1022   idx = a->j;
1023   v   = a->a;
1024   if (usecprow) {
1025     mbs  = a->compressedrow.nrows;
1026     ii   = a->compressedrow.i;
1027     ridx = a->compressedrow.rindex;
1028   } else {
1029     mbs = a->mbs;
1030     ii  = a->i;
1031     z   = zarray;
1032   }
1033 
1034   for (i=0; i<mbs; i++) {
1035     n  = ii[1] - ii[0];
1036     ii++;
1037     sum1 = 0.0;
1038     sum2 = 0.0;
1039     sum3 = 0.0;
1040     sum4 = 0.0;
1041     sum5 = 0.0;
1042     sum6 = 0.0;
1043 
1044     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1045     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1046     for (j=0; j<n; j++) {
1047       xb    = x + 6*(*idx++);
1048       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1049       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1050       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1051       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1052       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1053       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1054       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1055       v    += 36;
1056     }
1057     if (usecprow) z = zarray + 6*ridx[i];
1058     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1059     if (!usecprow) z += 6;
1060   }
1061 
1062   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1063   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1064   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
1065   PetscFunctionReturn(0);
1066 }
1067 #endif
1068 
1069 #if defined(PETSC_THREADCOMM_ACTIVE)
1070 PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
1071 {
1072   PetscErrorCode    ierr;
1073   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1074   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1075   const PetscScalar *x,*xb;
1076   const MatScalar   *aa;
1077   PetscInt          *trstarts=A->rmap->trstarts;
1078   PetscInt          n,start,end,i,j;
1079   const PetscInt    *aj,*ai;
1080 
1081   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1082   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
1083   start  = trstarts[thread_id] / 7;
1084   end    = trstarts[thread_id+1] / 7;
1085   ai     = a->i;
1086   for (i=start; i<end; i++) {
1087     n    = ai[i+1] - ai[i];
1088     aj   = a->j + ai[i];
1089     aa   = a->a + ai[i]*49;
1090     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1091     for (j=0; j<n; j++) {
1092       xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1093       sum1 += aa[49*j]*x1   + aa[49*j+7]*x2  + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7;
1094       sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2  + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7;
1095       sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2  + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7;
1096       sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7;
1097       sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7;
1098       sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7;
1099       sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7;
1100     }
1101     z[7*i]   = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4;
1102     z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7;
1103   }
1104   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1105   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1106   return 0;
1107 }
1108 
1109 #undef __FUNCT__
1110 #define __FUNCT__ "MatMult_SeqBAIJ_7"
1111 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1112 {
1113   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1114   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1115   const PetscScalar *x,*xb;
1116   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1117   const MatScalar   *v;
1118   PetscErrorCode    ierr;
1119   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1120   PetscBool         usecprow=a->compressedrow.use;
1121 
1122   PetscFunctionBegin;
1123 
1124   if (usecprow) {
1125     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1126     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1127     mbs  = a->compressedrow.nrows;
1128     ii   = a->compressedrow.i;
1129     ridx = a->compressedrow.rindex;
1130     for (i=0; i<mbs; i++) {
1131       n    = ii[i+1] - ii[i];
1132       idx  = a->j + ii[i];
1133       v    = a->a + ii[i]*49;
1134       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1135       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1136       PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1137       for (j=0; j<n; j++) {
1138 	xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1139 	sum1 += v[49*j]*x1   + v[49*j+7]*x2  + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7;
1140 	sum2 += v[49*j+1]*x1 + v[49*j+8]*x2  + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7;
1141 	sum3 += v[49*j+2]*x1 + v[49*j+9]*x2  + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7;
1142 	sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7;
1143 	sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7;
1144 	sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7;
1145 	sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7;
1146       }
1147       z[7*ridx[i]]   = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4;
1148       z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7;
1149     }
1150     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1151     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1152   } else {
1153     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);CHKERRQ(ierr);
1154   }
1155   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
1156   PetscFunctionReturn(0);
1157 }
1158 #else
1159 #undef __FUNCT__
1160 #define __FUNCT__ "MatMult_SeqBAIJ_7"
1161 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1162 {
1163   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1164   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1165   const PetscScalar *x,*xb;
1166   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1167   const MatScalar   *v;
1168   PetscErrorCode    ierr;
1169   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1170   PetscBool         usecprow=a->compressedrow.use;
1171 
1172   PetscFunctionBegin;
1173   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1174   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1175 
1176   idx = a->j;
1177   v   = a->a;
1178   if (usecprow) {
1179     mbs  = a->compressedrow.nrows;
1180     ii   = a->compressedrow.i;
1181     ridx = a->compressedrow.rindex;
1182   } else {
1183     mbs = a->mbs;
1184     ii  = a->i;
1185     z   = zarray;
1186   }
1187 
1188   for (i=0; i<mbs; i++) {
1189     n  = ii[1] - ii[0];
1190     ii++;
1191     sum1 = 0.0;
1192     sum2 = 0.0;
1193     sum3 = 0.0;
1194     sum4 = 0.0;
1195     sum5 = 0.0;
1196     sum6 = 0.0;
1197     sum7 = 0.0;
1198 
1199     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1200     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1201     for (j=0; j<n; j++) {
1202       xb    = x + 7*(*idx++);
1203       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1204       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1205       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1206       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1207       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1208       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1209       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1210       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1211       v    += 49;
1212     }
1213     if (usecprow) z = zarray + 7*ridx[i];
1214     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1215     if (!usecprow) z += 7;
1216   }
1217 
1218   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1219   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1220   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
1221   PetscFunctionReturn(0);
1222 }
1223 #endif
1224 
1225 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1226 /* Default MatMult for block size 15 */
1227 
1228 #undef __FUNCT__
1229 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
1230 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1231 {
1232   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1233   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1234   const PetscScalar *x,*xb;
1235   PetscScalar       *zarray,xv;
1236   const MatScalar   *v;
1237   PetscErrorCode    ierr;
1238   const PetscInt    *ii,*ij=a->j,*idx;
1239   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1240   PetscBool         usecprow=a->compressedrow.use;
1241 
1242   PetscFunctionBegin;
1243   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1244   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1245 
1246   v = a->a;
1247   if (usecprow) {
1248     mbs  = a->compressedrow.nrows;
1249     ii   = a->compressedrow.i;
1250     ridx = a->compressedrow.rindex;
1251   } else {
1252     mbs = a->mbs;
1253     ii  = a->i;
1254     z   = zarray;
1255   }
1256 
1257   for (i=0; i<mbs; i++) {
1258     n    = ii[i+1] - ii[i];
1259     idx  = ij + ii[i];
1260     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1261     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1262 
1263     for (j=0; j<n; j++) {
1264       xb = x + 15*(idx[j]);
1265 
1266       for (k=0; k<15; k++) {
1267         xv     =  xb[k];
1268         sum1  += v[0]*xv;
1269         sum2  += v[1]*xv;
1270         sum3  += v[2]*xv;
1271         sum4  += v[3]*xv;
1272         sum5  += v[4]*xv;
1273         sum6  += v[5]*xv;
1274         sum7  += v[6]*xv;
1275         sum8  += v[7]*xv;
1276         sum9  += v[8]*xv;
1277         sum10 += v[9]*xv;
1278         sum11 += v[10]*xv;
1279         sum12 += v[11]*xv;
1280         sum13 += v[12]*xv;
1281         sum14 += v[13]*xv;
1282         sum15 += v[14]*xv;
1283         v     += 15;
1284       }
1285     }
1286     if (usecprow) z = zarray + 15*ridx[i];
1287     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1288     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1289 
1290     if (!usecprow) z += 15;
1291   }
1292 
1293   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1294   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1295   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1296   PetscFunctionReturn(0);
1297 }
1298 
1299 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1300 #undef __FUNCT__
1301 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
1302 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1303 {
1304   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1305   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1306   const PetscScalar *x,*xb;
1307   PetscScalar       x1,x2,x3,x4,*zarray;
1308   const MatScalar   *v;
1309   PetscErrorCode    ierr;
1310   const PetscInt    *ii,*ij=a->j,*idx;
1311   PetscInt          mbs,i,j,n,*ridx=NULL;
1312   PetscBool         usecprow=a->compressedrow.use;
1313 
1314   PetscFunctionBegin;
1315   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1316   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1317 
1318   v = a->a;
1319   if (usecprow) {
1320     mbs  = a->compressedrow.nrows;
1321     ii   = a->compressedrow.i;
1322     ridx = a->compressedrow.rindex;
1323   } else {
1324     mbs = a->mbs;
1325     ii  = a->i;
1326     z   = zarray;
1327   }
1328 
1329   for (i=0; i<mbs; i++) {
1330     n    = ii[i+1] - ii[i];
1331     idx  = ij + ii[i];
1332     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1333     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1334 
1335     for (j=0; j<n; j++) {
1336       xb = x + 15*(idx[j]);
1337       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1338 
1339       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1340       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1341       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1342       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1343       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1344       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1345       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1346       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1347       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1348       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1349       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1350       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1351       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1352       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1353       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1354 
1355       v += 60;
1356 
1357       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1358 
1359       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1360       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1361       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1362       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1363       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1364       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1365       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1366       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1367       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1368       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1369       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1370       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1371       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1372       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1373       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1374       v     += 60;
1375 
1376       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1377       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1378       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1379       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1380       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1381       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1382       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1383       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1384       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1385       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1386       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1387       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1388       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1389       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1390       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1391       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1392       v     += 60;
1393 
1394       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1395       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1396       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1397       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1398       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1399       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1400       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1401       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1402       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1403       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1404       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1405       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1406       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1407       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1408       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1409       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1410       v     += 45;
1411     }
1412     if (usecprow) z = zarray + 15*ridx[i];
1413     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1414     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1415 
1416     if (!usecprow) z += 15;
1417   }
1418 
1419   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1420   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1421   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1422   PetscFunctionReturn(0);
1423 }
1424 
1425 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1426 #undef __FUNCT__
1427 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
1428 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1429 {
1430   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1431   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1432   const PetscScalar *x,*xb;
1433   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1434   const MatScalar   *v;
1435   PetscErrorCode    ierr;
1436   const PetscInt    *ii,*ij=a->j,*idx;
1437   PetscInt          mbs,i,j,n,*ridx=NULL;
1438   PetscBool         usecprow=a->compressedrow.use;
1439 
1440   PetscFunctionBegin;
1441   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1442   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1443 
1444   v = a->a;
1445   if (usecprow) {
1446     mbs  = a->compressedrow.nrows;
1447     ii   = a->compressedrow.i;
1448     ridx = a->compressedrow.rindex;
1449   } else {
1450     mbs = a->mbs;
1451     ii  = a->i;
1452     z   = zarray;
1453   }
1454 
1455   for (i=0; i<mbs; i++) {
1456     n    = ii[i+1] - ii[i];
1457     idx  = ij + ii[i];
1458     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1459     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1460 
1461     for (j=0; j<n; j++) {
1462       xb = x + 15*(idx[j]);
1463       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1464       x8 = xb[7];
1465 
1466       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1467       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1468       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1469       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1470       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1471       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1472       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1473       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1474       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1475       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1476       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1477       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1478       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1479       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1480       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1481       v     += 120;
1482 
1483       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1484 
1485       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1486       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1487       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1488       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1489       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1490       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1491       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1492       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1493       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1494       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1495       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1496       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1497       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1498       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1499       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1500       v     += 105;
1501     }
1502     if (usecprow) z = zarray + 15*ridx[i];
1503     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1504     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1505 
1506     if (!usecprow) z += 15;
1507   }
1508 
1509   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1510   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1511   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1512   PetscFunctionReturn(0);
1513 }
1514 
1515 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1516 
1517 #undef __FUNCT__
1518 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
1519 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1520 {
1521   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1522   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1523   const PetscScalar *x,*xb;
1524   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1525   const MatScalar   *v;
1526   PetscErrorCode    ierr;
1527   const PetscInt    *ii,*ij=a->j,*idx;
1528   PetscInt          mbs,i,j,n,*ridx=NULL;
1529   PetscBool         usecprow=a->compressedrow.use;
1530 
1531   PetscFunctionBegin;
1532   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1533   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1534 
1535   v = a->a;
1536   if (usecprow) {
1537     mbs  = a->compressedrow.nrows;
1538     ii   = a->compressedrow.i;
1539     ridx = a->compressedrow.rindex;
1540   } else {
1541     mbs = a->mbs;
1542     ii  = a->i;
1543     z   = zarray;
1544   }
1545 
1546   for (i=0; i<mbs; i++) {
1547     n    = ii[i+1] - ii[i];
1548     idx  = ij + ii[i];
1549     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1550     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1551 
1552     for (j=0; j<n; j++) {
1553       xb = x + 15*(idx[j]);
1554       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1555       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1556 
1557       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1558       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1559       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1560       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1561       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1562       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1563       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1564       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1565       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1566       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1567       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1568       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1569       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1570       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1571       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1572       v     += 225;
1573     }
1574     if (usecprow) z = zarray + 15*ridx[i];
1575     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1576     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1577 
1578     if (!usecprow) z += 15;
1579   }
1580 
1581   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1582   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1583   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1584   PetscFunctionReturn(0);
1585 }
1586 
1587 
1588 /*
1589     This will not work with MatScalar == float because it calls the BLAS
1590 */
1591 #undef __FUNCT__
1592 #define __FUNCT__ "MatMult_SeqBAIJ_N"
1593 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1594 {
1595   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1596   PetscScalar       *z = 0,*work,*workt,*zarray;
1597   const PetscScalar *x,*xb;
1598   const MatScalar   *v;
1599   PetscErrorCode    ierr;
1600   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1601   const PetscInt    *idx,*ii,*ridx=NULL;
1602   PetscInt          ncols,k;
1603   PetscBool         usecprow=a->compressedrow.use;
1604 
1605   PetscFunctionBegin;
1606   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1607   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1608 
1609   idx = a->j;
1610   v   = a->a;
1611   if (usecprow) {
1612     mbs  = a->compressedrow.nrows;
1613     ii   = a->compressedrow.i;
1614     ridx = a->compressedrow.rindex;
1615   } else {
1616     mbs = a->mbs;
1617     ii  = a->i;
1618     z   = zarray;
1619   }
1620 
1621   if (!a->mult_work) {
1622     k    = PetscMax(A->rmap->n,A->cmap->n);
1623     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1624   }
1625   work = a->mult_work;
1626   for (i=0; i<mbs; i++) {
1627     n           = ii[1] - ii[0]; ii++;
1628     ncols       = n*bs;
1629     workt       = work;
1630     for (j=0; j<n; j++) {
1631       xb = x + bs*(*idx++);
1632       for (k=0; k<bs; k++) workt[k] = xb[k];
1633       workt += bs;
1634     }
1635     if (usecprow) z = zarray + bs*ridx[i];
1636     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1637     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1638     v += n*bs2;
1639     if (!usecprow) z += bs;
1640   }
1641   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1642   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1643   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1644   PetscFunctionReturn(0);
1645 }
1646 
1647 #undef __FUNCT__
1648 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1649 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1650 {
1651   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1652   const PetscScalar *x;
1653   PetscScalar       *y,*z,sum;
1654   const MatScalar   *v;
1655   PetscErrorCode    ierr;
1656   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1657   const PetscInt    *idx,*ii;
1658   PetscBool         usecprow=a->compressedrow.use;
1659 
1660   PetscFunctionBegin;
1661   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1662   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1663 
1664   idx = a->j;
1665   v   = a->a;
1666   if (usecprow) {
1667     if (zz != yy) {
1668       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1669     }
1670     mbs  = a->compressedrow.nrows;
1671     ii   = a->compressedrow.i;
1672     ridx = a->compressedrow.rindex;
1673   } else {
1674     ii = a->i;
1675   }
1676 
1677   for (i=0; i<mbs; i++) {
1678     n = ii[1] - ii[0];
1679     ii++;
1680     if (!usecprow) {
1681       sum         = y[i];
1682     } else {
1683       sum = y[ridx[i]];
1684     }
1685     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1686     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1687     PetscSparseDensePlusDot(sum,x,v,idx,n);
1688     v   += n;
1689     idx += n;
1690     if (usecprow) {
1691       z[ridx[i]] = sum;
1692     } else {
1693       z[i] = sum;
1694     }
1695   }
1696   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1697   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1698   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1699   PetscFunctionReturn(0);
1700 }
1701 
1702 #undef __FUNCT__
1703 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1704 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1705 {
1706   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1707   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1708   const PetscScalar *x,*xb;
1709   PetscScalar       x1,x2,*yarray,*zarray;
1710   const MatScalar   *v;
1711   PetscErrorCode    ierr;
1712   PetscInt          mbs = a->mbs,i,n,j;
1713   const PetscInt    *idx,*ii,*ridx = NULL;
1714   PetscBool         usecprow = a->compressedrow.use;
1715 
1716   PetscFunctionBegin;
1717   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1718   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1719 
1720   idx = a->j;
1721   v   = a->a;
1722   if (usecprow) {
1723     if (zz != yy) {
1724       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1725     }
1726     mbs  = a->compressedrow.nrows;
1727     ii   = a->compressedrow.i;
1728     ridx = a->compressedrow.rindex;
1729     if (zz != yy) {
1730       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1731     }
1732   } else {
1733     ii = a->i;
1734     y  = yarray;
1735     z  = zarray;
1736   }
1737 
1738   for (i=0; i<mbs; i++) {
1739     n = ii[1] - ii[0]; ii++;
1740     if (usecprow) {
1741       z = zarray + 2*ridx[i];
1742       y = yarray + 2*ridx[i];
1743     }
1744     sum1 = y[0]; sum2 = y[1];
1745     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1746     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1747     for (j=0; j<n; j++) {
1748       xb = x + 2*(*idx++);
1749       x1 = xb[0];
1750       x2 = xb[1];
1751 
1752       sum1 += v[0]*x1 + v[2]*x2;
1753       sum2 += v[1]*x1 + v[3]*x2;
1754       v    += 4;
1755     }
1756     z[0] = sum1; z[1] = sum2;
1757     if (!usecprow) {
1758       z += 2; y += 2;
1759     }
1760   }
1761   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1762   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1763   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1764   PetscFunctionReturn(0);
1765 }
1766 
1767 #undef __FUNCT__
1768 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1769 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1770 {
1771   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1772   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1773   const PetscScalar *x,*xb;
1774   const MatScalar   *v;
1775   PetscErrorCode    ierr;
1776   PetscInt          mbs = a->mbs,i,j,n;
1777   const PetscInt    *idx,*ii,*ridx = NULL;
1778   PetscBool         usecprow = a->compressedrow.use;
1779 
1780   PetscFunctionBegin;
1781   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1782   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1783 
1784   idx = a->j;
1785   v   = a->a;
1786   if (usecprow) {
1787     if (zz != yy) {
1788       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1789     }
1790     mbs  = a->compressedrow.nrows;
1791     ii   = a->compressedrow.i;
1792     ridx = a->compressedrow.rindex;
1793   } else {
1794     ii = a->i;
1795     y  = yarray;
1796     z  = zarray;
1797   }
1798 
1799   for (i=0; i<mbs; i++) {
1800     n = ii[1] - ii[0]; ii++;
1801     if (usecprow) {
1802       z = zarray + 3*ridx[i];
1803       y = yarray + 3*ridx[i];
1804     }
1805     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1806     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1807     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1808     for (j=0; j<n; j++) {
1809       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1810       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1811       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1812       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1813       v    += 9;
1814     }
1815     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1816     if (!usecprow) {
1817       z += 3; y += 3;
1818     }
1819   }
1820   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1821   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1822   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1823   PetscFunctionReturn(0);
1824 }
1825 
1826 #undef __FUNCT__
1827 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1828 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1829 {
1830   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1831   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1832   const PetscScalar *x,*xb;
1833   const MatScalar   *v;
1834   PetscErrorCode    ierr;
1835   PetscInt          mbs = a->mbs,i,j,n;
1836   const PetscInt    *idx,*ii,*ridx=NULL;
1837   PetscBool         usecprow=a->compressedrow.use;
1838 
1839   PetscFunctionBegin;
1840   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1841   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1842 
1843   idx = a->j;
1844   v   = a->a;
1845   if (usecprow) {
1846     if (zz != yy) {
1847       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1848     }
1849     mbs  = a->compressedrow.nrows;
1850     ii   = a->compressedrow.i;
1851     ridx = a->compressedrow.rindex;
1852   } else {
1853     ii = a->i;
1854     y  = yarray;
1855     z  = zarray;
1856   }
1857 
1858   for (i=0; i<mbs; i++) {
1859     n = ii[1] - ii[0]; ii++;
1860     if (usecprow) {
1861       z = zarray + 4*ridx[i];
1862       y = yarray + 4*ridx[i];
1863     }
1864     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1865     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1866     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1867     for (j=0; j<n; j++) {
1868       xb    = x + 4*(*idx++);
1869       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1870       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1871       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1872       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1873       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1874       v    += 16;
1875     }
1876     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1877     if (!usecprow) {
1878       z += 4; y += 4;
1879     }
1880   }
1881   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1882   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1883   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1884   PetscFunctionReturn(0);
1885 }
1886 
1887 #undef __FUNCT__
1888 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1889 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1890 {
1891   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1892   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1893   const PetscScalar *x,*xb;
1894   PetscScalar       *yarray,*zarray;
1895   const MatScalar   *v;
1896   PetscErrorCode    ierr;
1897   PetscInt          mbs = a->mbs,i,j,n;
1898   const PetscInt    *idx,*ii,*ridx = NULL;
1899   PetscBool         usecprow=a->compressedrow.use;
1900 
1901   PetscFunctionBegin;
1902   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1903   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1904 
1905   idx = a->j;
1906   v   = a->a;
1907   if (usecprow) {
1908     if (zz != yy) {
1909       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1910     }
1911     mbs  = a->compressedrow.nrows;
1912     ii   = a->compressedrow.i;
1913     ridx = a->compressedrow.rindex;
1914   } else {
1915     ii = a->i;
1916     y  = yarray;
1917     z  = zarray;
1918   }
1919 
1920   for (i=0; i<mbs; i++) {
1921     n = ii[1] - ii[0]; ii++;
1922     if (usecprow) {
1923       z = zarray + 5*ridx[i];
1924       y = yarray + 5*ridx[i];
1925     }
1926     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1927     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1928     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1929     for (j=0; j<n; j++) {
1930       xb    = x + 5*(*idx++);
1931       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1932       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1933       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1934       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1935       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1936       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1937       v    += 25;
1938     }
1939     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1940     if (!usecprow) {
1941       z += 5; y += 5;
1942     }
1943   }
1944   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1945   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1946   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1947   PetscFunctionReturn(0);
1948 }
1949 #undef __FUNCT__
1950 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1951 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1952 {
1953   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1954   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1955   const PetscScalar *x,*xb;
1956   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1957   const MatScalar   *v;
1958   PetscErrorCode    ierr;
1959   PetscInt          mbs = a->mbs,i,j,n;
1960   const PetscInt    *idx,*ii,*ridx=NULL;
1961   PetscBool         usecprow=a->compressedrow.use;
1962 
1963   PetscFunctionBegin;
1964   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1965   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1966 
1967   idx = a->j;
1968   v   = a->a;
1969   if (usecprow) {
1970     if (zz != yy) {
1971       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1972     }
1973     mbs  = a->compressedrow.nrows;
1974     ii   = a->compressedrow.i;
1975     ridx = a->compressedrow.rindex;
1976   } else {
1977     ii = a->i;
1978     y  = yarray;
1979     z  = zarray;
1980   }
1981 
1982   for (i=0; i<mbs; i++) {
1983     n = ii[1] - ii[0]; ii++;
1984     if (usecprow) {
1985       z = zarray + 6*ridx[i];
1986       y = yarray + 6*ridx[i];
1987     }
1988     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1989     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1990     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1991     for (j=0; j<n; j++) {
1992       xb    = x + 6*(*idx++);
1993       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1994       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1995       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1996       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1997       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1998       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1999       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2000       v    += 36;
2001     }
2002     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2003     if (!usecprow) {
2004       z += 6; y += 6;
2005     }
2006   }
2007   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2008   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2009   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
2010   PetscFunctionReturn(0);
2011 }
2012 
2013 #undef __FUNCT__
2014 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
2015 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2016 {
2017   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2018   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2019   const PetscScalar *x,*xb;
2020   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2021   const MatScalar   *v;
2022   PetscErrorCode    ierr;
2023   PetscInt          mbs = a->mbs,i,j,n;
2024   const PetscInt    *idx,*ii,*ridx = NULL;
2025   PetscBool         usecprow=a->compressedrow.use;
2026 
2027   PetscFunctionBegin;
2028   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2029   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2030 
2031   idx = a->j;
2032   v   = a->a;
2033   if (usecprow) {
2034     if (zz != yy) {
2035       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2036     }
2037     mbs  = a->compressedrow.nrows;
2038     ii   = a->compressedrow.i;
2039     ridx = a->compressedrow.rindex;
2040   } else {
2041     ii = a->i;
2042     y  = yarray;
2043     z  = zarray;
2044   }
2045 
2046   for (i=0; i<mbs; i++) {
2047     n = ii[1] - ii[0]; ii++;
2048     if (usecprow) {
2049       z = zarray + 7*ridx[i];
2050       y = yarray + 7*ridx[i];
2051     }
2052     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2053     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2054     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2055     for (j=0; j<n; j++) {
2056       xb    = x + 7*(*idx++);
2057       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2058       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2059       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2060       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2061       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2062       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2063       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2064       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2065       v    += 49;
2066     }
2067     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2068     if (!usecprow) {
2069       z += 7; y += 7;
2070     }
2071   }
2072   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2073   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2074   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
2075   PetscFunctionReturn(0);
2076 }
2077 
2078 #undef __FUNCT__
2079 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
2080 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2081 {
2082   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2083   PetscScalar       *z = 0,*work,*workt,*zarray;
2084   const PetscScalar *x,*xb;
2085   const MatScalar   *v;
2086   PetscErrorCode    ierr;
2087   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2088   PetscInt          ncols,k;
2089   const PetscInt    *ridx = NULL,*idx,*ii;
2090   PetscBool         usecprow = a->compressedrow.use;
2091 
2092   PetscFunctionBegin;
2093   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
2094   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2095   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2096 
2097   idx = a->j;
2098   v   = a->a;
2099   if (usecprow) {
2100     mbs  = a->compressedrow.nrows;
2101     ii   = a->compressedrow.i;
2102     ridx = a->compressedrow.rindex;
2103   } else {
2104     mbs = a->mbs;
2105     ii  = a->i;
2106     z   = zarray;
2107   }
2108 
2109   if (!a->mult_work) {
2110     k    = PetscMax(A->rmap->n,A->cmap->n);
2111     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2112   }
2113   work = a->mult_work;
2114   for (i=0; i<mbs; i++) {
2115     n     = ii[1] - ii[0]; ii++;
2116     ncols = n*bs;
2117     workt = work;
2118     for (j=0; j<n; j++) {
2119       xb = x + bs*(*idx++);
2120       for (k=0; k<bs; k++) workt[k] = xb[k];
2121       workt += bs;
2122     }
2123     if (usecprow) z = zarray + bs*ridx[i];
2124     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2125     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
2126     v += n*bs2;
2127     if (!usecprow) z += bs;
2128   }
2129   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2130   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2131   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
2132   PetscFunctionReturn(0);
2133 }
2134 
2135 #undef __FUNCT__
2136 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
2137 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2138 {
2139   PetscScalar    zero = 0.0;
2140   PetscErrorCode ierr;
2141 
2142   PetscFunctionBegin;
2143   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2144   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2145   PetscFunctionReturn(0);
2146 }
2147 
2148 #undef __FUNCT__
2149 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
2150 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2151 {
2152   PetscScalar    zero = 0.0;
2153   PetscErrorCode ierr;
2154 
2155   PetscFunctionBegin;
2156   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2157   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2158   PetscFunctionReturn(0);
2159 }
2160 
2161 #undef __FUNCT__
2162 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
2163 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2164 {
2165   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2166   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2167   const PetscScalar *x,*xb = NULL;
2168   const MatScalar   *v;
2169   PetscErrorCode    ierr;
2170   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2171   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2172   Mat_CompressedRow cprow = a->compressedrow;
2173   PetscBool         usecprow = cprow.use;
2174 
2175   PetscFunctionBegin;
2176   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2177   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2178   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2179 
2180   idx = a->j;
2181   v   = a->a;
2182   if (usecprow) {
2183     mbs  = cprow.nrows;
2184     ii   = cprow.i;
2185     ridx = cprow.rindex;
2186   } else {
2187     mbs=a->mbs;
2188     ii = a->i;
2189     xb = x;
2190   }
2191 
2192   switch (bs) {
2193   case 1:
2194     for (i=0; i<mbs; i++) {
2195       if (usecprow) xb = x + ridx[i];
2196       x1 = xb[0];
2197       ib = idx + ii[0];
2198       n  = ii[1] - ii[0]; ii++;
2199       for (j=0; j<n; j++) {
2200         rval     = ib[j];
2201         z[rval] += PetscConj(*v) * x1;
2202         v++;
2203       }
2204       if (!usecprow) xb++;
2205     }
2206     break;
2207   case 2:
2208     for (i=0; i<mbs; i++) {
2209       if (usecprow) xb = x + 2*ridx[i];
2210       x1 = xb[0]; x2 = xb[1];
2211       ib = idx + ii[0];
2212       n  = ii[1] - ii[0]; ii++;
2213       for (j=0; j<n; j++) {
2214         rval       = ib[j]*2;
2215         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2216         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2217         v         += 4;
2218       }
2219       if (!usecprow) xb += 2;
2220     }
2221     break;
2222   case 3:
2223     for (i=0; i<mbs; i++) {
2224       if (usecprow) xb = x + 3*ridx[i];
2225       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2226       ib = idx + ii[0];
2227       n  = ii[1] - ii[0]; ii++;
2228       for (j=0; j<n; j++) {
2229         rval       = ib[j]*3;
2230         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2231         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2232         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2233         v         += 9;
2234       }
2235       if (!usecprow) xb += 3;
2236     }
2237     break;
2238   case 4:
2239     for (i=0; i<mbs; i++) {
2240       if (usecprow) xb = x + 4*ridx[i];
2241       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2242       ib = idx + ii[0];
2243       n  = ii[1] - ii[0]; ii++;
2244       for (j=0; j<n; j++) {
2245         rval       = ib[j]*4;
2246         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2247         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2248         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2249         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2250         v         += 16;
2251       }
2252       if (!usecprow) xb += 4;
2253     }
2254     break;
2255   case 5:
2256     for (i=0; i<mbs; i++) {
2257       if (usecprow) xb = x + 5*ridx[i];
2258       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2259       x4 = xb[3]; x5 = xb[4];
2260       ib = idx + ii[0];
2261       n  = ii[1] - ii[0]; ii++;
2262       for (j=0; j<n; j++) {
2263         rval       = ib[j]*5;
2264         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2265         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2266         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2267         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2268         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2269         v         += 25;
2270       }
2271       if (!usecprow) xb += 5;
2272     }
2273     break;
2274   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
2275     PetscInt          ncols,k;
2276     PetscScalar       *work,*workt;
2277     const PetscScalar *xtmp;
2278 
2279     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2280     if (!a->mult_work) {
2281       k    = PetscMax(A->rmap->n,A->cmap->n);
2282       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2283     }
2284     work = a->mult_work;
2285     xtmp = x;
2286     for (i=0; i<mbs; i++) {
2287       n     = ii[1] - ii[0]; ii++;
2288       ncols = n*bs;
2289       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2290       if (usecprow) xtmp = x + bs*ridx[i];
2291       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2292       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2293       v += n*bs2;
2294       if (!usecprow) xtmp += bs;
2295       workt = work;
2296       for (j=0; j<n; j++) {
2297         zb = z + bs*(*idx++);
2298         for (k=0; k<bs; k++) zb[k] += workt[k] ;
2299         workt += bs;
2300       }
2301     }
2302     }
2303   }
2304   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2305   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2306   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2307   PetscFunctionReturn(0);
2308 }
2309 
2310 #undef __FUNCT__
2311 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
2312 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2313 {
2314   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2315   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2316   const PetscScalar *x,*xb = 0;
2317   const MatScalar   *v;
2318   PetscErrorCode    ierr;
2319   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2320   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2321   Mat_CompressedRow cprow   = a->compressedrow;
2322   PetscBool         usecprow=cprow.use;
2323 
2324   PetscFunctionBegin;
2325   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2326   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2327   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2328 
2329   idx = a->j;
2330   v   = a->a;
2331   if (usecprow) {
2332     mbs  = cprow.nrows;
2333     ii   = cprow.i;
2334     ridx = cprow.rindex;
2335   } else {
2336     mbs=a->mbs;
2337     ii = a->i;
2338     xb = x;
2339   }
2340 
2341   switch (bs) {
2342   case 1:
2343     for (i=0; i<mbs; i++) {
2344       if (usecprow) xb = x + ridx[i];
2345       x1 = xb[0];
2346       ib = idx + ii[0];
2347       n  = ii[1] - ii[0]; ii++;
2348       for (j=0; j<n; j++) {
2349         rval     = ib[j];
2350         z[rval] += *v * x1;
2351         v++;
2352       }
2353       if (!usecprow) xb++;
2354     }
2355     break;
2356   case 2:
2357     for (i=0; i<mbs; i++) {
2358       if (usecprow) xb = x + 2*ridx[i];
2359       x1 = xb[0]; x2 = xb[1];
2360       ib = idx + ii[0];
2361       n  = ii[1] - ii[0]; ii++;
2362       for (j=0; j<n; j++) {
2363         rval       = ib[j]*2;
2364         z[rval++] += v[0]*x1 + v[1]*x2;
2365         z[rval++] += v[2]*x1 + v[3]*x2;
2366         v         += 4;
2367       }
2368       if (!usecprow) xb += 2;
2369     }
2370     break;
2371   case 3:
2372     for (i=0; i<mbs; i++) {
2373       if (usecprow) xb = x + 3*ridx[i];
2374       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2375       ib = idx + ii[0];
2376       n  = ii[1] - ii[0]; ii++;
2377       for (j=0; j<n; j++) {
2378         rval       = ib[j]*3;
2379         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2380         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2381         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2382         v         += 9;
2383       }
2384       if (!usecprow) xb += 3;
2385     }
2386     break;
2387   case 4:
2388     for (i=0; i<mbs; i++) {
2389       if (usecprow) xb = x + 4*ridx[i];
2390       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2391       ib = idx + ii[0];
2392       n  = ii[1] - ii[0]; ii++;
2393       for (j=0; j<n; j++) {
2394         rval       = ib[j]*4;
2395         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2396         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2397         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2398         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2399         v         += 16;
2400       }
2401       if (!usecprow) xb += 4;
2402     }
2403     break;
2404   case 5:
2405     for (i=0; i<mbs; i++) {
2406       if (usecprow) xb = x + 5*ridx[i];
2407       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2408       x4 = xb[3]; x5 = xb[4];
2409       ib = idx + ii[0];
2410       n  = ii[1] - ii[0]; ii++;
2411       for (j=0; j<n; j++) {
2412         rval       = ib[j]*5;
2413         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2414         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2415         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2416         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2417         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2418         v         += 25;
2419       }
2420       if (!usecprow) xb += 5;
2421     }
2422     break;
2423   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2424     PetscInt          ncols,k;
2425     PetscScalar       *work,*workt;
2426     const PetscScalar *xtmp;
2427     if (!a->mult_work) {
2428       k    = PetscMax(A->rmap->n,A->cmap->n);
2429       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2430     }
2431     work = a->mult_work;
2432     xtmp = x;
2433     for (i=0; i<mbs; i++) {
2434       n     = ii[1] - ii[0]; ii++;
2435       ncols = n*bs;
2436       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2437       if (usecprow) xtmp = x + bs*ridx[i];
2438       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2439       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2440       v += n*bs2;
2441       if (!usecprow) xtmp += bs;
2442       workt = work;
2443       for (j=0; j<n; j++) {
2444         zb = z + bs*(*idx++);
2445         for (k=0; k<bs; k++) zb[k] += workt[k];
2446         workt += bs;
2447       }
2448     }
2449     }
2450   }
2451   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2452   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2453   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2454   PetscFunctionReturn(0);
2455 }
2456 
2457 #undef __FUNCT__
2458 #define __FUNCT__ "MatScale_SeqBAIJ"
2459 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2460 {
2461   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2462   PetscInt       totalnz = a->bs2*a->nz;
2463   PetscScalar    oalpha  = alpha;
2464   PetscErrorCode ierr;
2465   PetscBLASInt   one = 1,tnz;
2466 
2467   PetscFunctionBegin;
2468   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2469   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2470   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2471   PetscFunctionReturn(0);
2472 }
2473 
2474 #undef __FUNCT__
2475 #define __FUNCT__ "MatNorm_SeqBAIJ"
2476 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2477 {
2478   PetscErrorCode ierr;
2479   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2480   MatScalar      *v  = a->a;
2481   PetscReal      sum = 0.0;
2482   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2483 
2484   PetscFunctionBegin;
2485   if (type == NORM_FROBENIUS) {
2486     for (i=0; i< bs2*nz; i++) {
2487       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2488     }
2489     *norm = PetscSqrtReal(sum);
2490   } else if (type == NORM_1) { /* maximum column sum */
2491     PetscReal *tmp;
2492     PetscInt  *bcol = a->j;
2493     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2494     for (i=0; i<nz; i++) {
2495       for (j=0; j<bs; j++) {
2496         k1 = bs*(*bcol) + j; /* column index */
2497         for (k=0; k<bs; k++) {
2498           tmp[k1] += PetscAbsScalar(*v); v++;
2499         }
2500       }
2501       bcol++;
2502     }
2503     *norm = 0.0;
2504     for (j=0; j<A->cmap->n; j++) {
2505       if (tmp[j] > *norm) *norm = tmp[j];
2506     }
2507     ierr = PetscFree(tmp);CHKERRQ(ierr);
2508   } else if (type == NORM_INFINITY) { /* maximum row sum */
2509     *norm = 0.0;
2510     for (k=0; k<bs; k++) {
2511       for (j=0; j<a->mbs; j++) {
2512         v   = a->a + bs2*a->i[j] + k;
2513         sum = 0.0;
2514         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2515           for (k1=0; k1<bs; k1++) {
2516             sum += PetscAbsScalar(*v);
2517             v   += bs;
2518           }
2519         }
2520         if (sum > *norm) *norm = sum;
2521       }
2522     }
2523   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2524   PetscFunctionReturn(0);
2525 }
2526 
2527 
2528 #undef __FUNCT__
2529 #define __FUNCT__ "MatEqual_SeqBAIJ"
2530 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2531 {
2532   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2533   PetscErrorCode ierr;
2534 
2535   PetscFunctionBegin;
2536   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2537   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2538     *flg = PETSC_FALSE;
2539     PetscFunctionReturn(0);
2540   }
2541 
2542   /* if the a->i are the same */
2543   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2544   if (!*flg) PetscFunctionReturn(0);
2545 
2546   /* if a->j are the same */
2547   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2548   if (!*flg) PetscFunctionReturn(0);
2549 
2550   /* if a->a are the same */
2551   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2552   PetscFunctionReturn(0);
2553 
2554 }
2555 
2556 #undef __FUNCT__
2557 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
2558 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2559 {
2560   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2561   PetscErrorCode ierr;
2562   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2563   PetscScalar    *x,zero = 0.0;
2564   MatScalar      *aa,*aa_j;
2565 
2566   PetscFunctionBegin;
2567   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2568   bs   = A->rmap->bs;
2569   aa   = a->a;
2570   ai   = a->i;
2571   aj   = a->j;
2572   ambs = a->mbs;
2573   bs2  = a->bs2;
2574 
2575   ierr = VecSet(v,zero);CHKERRQ(ierr);
2576   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2577   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2578   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2579   for (i=0; i<ambs; i++) {
2580     for (j=ai[i]; j<ai[i+1]; j++) {
2581       if (aj[j] == i) {
2582         row  = i*bs;
2583         aa_j = aa+j*bs2;
2584         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2585         break;
2586       }
2587     }
2588   }
2589   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2590   PetscFunctionReturn(0);
2591 }
2592 
2593 #undef __FUNCT__
2594 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2595 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2596 {
2597   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2598   const PetscScalar *l,*r,*li,*ri;
2599   PetscScalar       x;
2600   MatScalar         *aa, *v;
2601   PetscErrorCode    ierr;
2602   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2603   const PetscInt    *ai,*aj;
2604 
2605   PetscFunctionBegin;
2606   ai  = a->i;
2607   aj  = a->j;
2608   aa  = a->a;
2609   m   = A->rmap->n;
2610   n   = A->cmap->n;
2611   bs  = A->rmap->bs;
2612   mbs = a->mbs;
2613   bs2 = a->bs2;
2614   if (ll) {
2615     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2616     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2617     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2618     for (i=0; i<mbs; i++) { /* for each block row */
2619       M  = ai[i+1] - ai[i];
2620       li = l + i*bs;
2621       v  = aa + bs2*ai[i];
2622       for (j=0; j<M; j++) { /* for each block */
2623         for (k=0; k<bs2; k++) {
2624           (*v++) *= li[k%bs];
2625         }
2626       }
2627     }
2628     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2629     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2630   }
2631 
2632   if (rr) {
2633     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2634     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2635     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2636     for (i=0; i<mbs; i++) { /* for each block row */
2637       iai = ai[i];
2638       M   = ai[i+1] - iai;
2639       v   = aa + bs2*iai;
2640       for (j=0; j<M; j++) { /* for each block */
2641         ri = r + bs*aj[iai+j];
2642         for (k=0; k<bs; k++) {
2643           x = ri[k];
2644           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2645           v += bs;
2646         }
2647       }
2648     }
2649     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2650     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2651   }
2652   PetscFunctionReturn(0);
2653 }
2654 
2655 
2656 #undef __FUNCT__
2657 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2658 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2659 {
2660   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2661 
2662   PetscFunctionBegin;
2663   info->block_size   = a->bs2;
2664   info->nz_allocated = a->bs2*a->maxnz;
2665   info->nz_used      = a->bs2*a->nz;
2666   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2667   info->assemblies   = A->num_ass;
2668   info->mallocs      = A->info.mallocs;
2669   info->memory       = ((PetscObject)A)->mem;
2670   if (A->factortype) {
2671     info->fill_ratio_given  = A->info.fill_ratio_given;
2672     info->fill_ratio_needed = A->info.fill_ratio_needed;
2673     info->factor_mallocs    = A->info.factor_mallocs;
2674   } else {
2675     info->fill_ratio_given  = 0;
2676     info->fill_ratio_needed = 0;
2677     info->factor_mallocs    = 0;
2678   }
2679   PetscFunctionReturn(0);
2680 }
2681 
2682 
2683 #if defined(PETSC_THREADCOMM_ACTIVE)
2684 PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A)
2685 {
2686   PetscErrorCode ierr;
2687   PetscInt       *trstarts=A->rmap->trstarts;
2688   PetscInt       n,start,end;
2689   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
2690 
2691   start = trstarts[thread_id]/A->rmap->bs;
2692   end   = trstarts[thread_id+1]/A->rmap->bs;
2693   n     = a->i[end] - a->i[start];
2694   ierr  = PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
2695   return 0;
2696 }
2697 
2698 #undef __FUNCT__
2699 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2700 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2701 {
2702   PetscErrorCode ierr;
2703 
2704 
2705   PetscFunctionBegin;
2706   ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);CHKERRQ(ierr);
2707   PetscFunctionReturn(0);
2708 }
2709 #else
2710 #undef __FUNCT__
2711 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2712 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2713 {
2714   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2715   PetscErrorCode ierr;
2716 
2717   PetscFunctionBegin;
2718   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2719   PetscFunctionReturn(0);
2720 }
2721 #endif
2722