xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision b60d1aa9dffe7f0839cca942e8a5760aa5de1720)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc-private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 #if defined(PETSC_THREADCOMM_ACTIVE)
7 #include <petscthreadcomm.h>
8 #endif
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13 {
14   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
15   PetscErrorCode ierr;
16   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
17   const PetscInt *idx;
18   PetscInt       start,end,*ai,*aj,bs,*nidx2;
19   PetscBT        table;
20 
21   PetscFunctionBegin;
22   m  = a->mbs;
23   ai = a->i;
24   aj = a->j;
25   bs = A->rmap->bs;
26 
27   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28 
29   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
30   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
31   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
32 
33   for (i=0; i<is_max; i++) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for (j=0; j<n; ++j) {
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++) { /* for each overlap*/
53       n = isz;
54       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for (l = start; l<end; l++) {
59           val = aj[l];
60           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++) {
66       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67     }
68     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
69   }
70   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
71   ierr = PetscFree(nidx);CHKERRQ(ierr);
72   ierr = PetscFree(nidx2);CHKERRQ(ierr);
73   PetscFunctionReturn(0);
74 }
75 
76 #undef __FUNCT__
77 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
78 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
79 {
80   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
81   PetscErrorCode ierr;
82   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
83   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
84   const PetscInt *irow,*icol;
85   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
86   PetscInt       *aj = a->j,*ai = a->i;
87   MatScalar      *mat_a;
88   Mat            C;
89   PetscBool      flag;
90 
91   PetscFunctionBegin;
92 
93   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
94   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
95   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
96   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
97 
98   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
99   ssmap = smap;
100   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
101   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
102   /* determine lens of each row */
103   for (i=0; i<nrows; i++) {
104     kstart  = ai[irow[i]];
105     kend    = kstart + a->ilen[irow[i]];
106     lens[i] = 0;
107     for (k=kstart; k<kend; k++) {
108       if (ssmap[aj[k]]) lens[i]++;
109     }
110   }
111   /* Create and fill new matrix */
112   if (scall == MAT_REUSE_MATRIX) {
113     c = (Mat_SeqBAIJ*)((*B)->data);
114 
115     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
116     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
117     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
118     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
119     C    = *B;
120   } else {
121     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
122     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
123     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
124     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
125   }
126   c = (Mat_SeqBAIJ*)(C->data);
127   for (i=0; i<nrows; i++) {
128     row      = irow[i];
129     kstart   = ai[row];
130     kend     = kstart + a->ilen[row];
131     mat_i    = c->i[i];
132     mat_j    = c->j + mat_i;
133     mat_a    = c->a + mat_i*bs2;
134     mat_ilen = c->ilen + i;
135     for (k=kstart; k<kend; k++) {
136       if ((tcol=ssmap[a->j[k]])) {
137         *mat_j++ = tcol - 1;
138         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
139         mat_a   += bs2;
140         (*mat_ilen)++;
141       }
142     }
143   }
144   /* sort */
145   {
146     MatScalar *work;
147 
148     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
149     for (i=0; i<nrows; i++) {
150       PetscInt ilen;
151       mat_i = c->i[i];
152       mat_j = c->j + mat_i;
153       mat_a = c->a + mat_i*bs2;
154       ilen  = c->ilen[i];
155       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
156     }
157     ierr = PetscFree(work);CHKERRQ(ierr);
158   }
159 
160   /* Free work space */
161   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
162   ierr = PetscFree(smap);CHKERRQ(ierr);
163   ierr = PetscFree(lens);CHKERRQ(ierr);
164   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
165   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
166 
167   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
168   *B   = C;
169   PetscFunctionReturn(0);
170 }
171 
172 #undef __FUNCT__
173 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
174 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
175 {
176   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
177   IS             is1,is2;
178   PetscErrorCode ierr;
179   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
180   const PetscInt *irow,*icol;
181 
182   PetscFunctionBegin;
183   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
184   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
185   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
186   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
187 
188   /* Verify if the indices corespond to each element in a block
189    and form the IS with compressed IS */
190   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
191   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
192   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
193   for (i=0; i<a->mbs; i++) {
194     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
195   }
196   count = 0;
197   for (i=0; i<nrows; i++) {
198     PetscInt j = irow[i] / bs;
199     if ((vary[j]--)==bs) iary[count++] = j;
200   }
201   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
202 
203   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
204   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
205   for (i=0; i<a->mbs; i++) {
206     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
207     if (vary[i]==bs) iary[count++] = i;
208   }
209   count = 0;
210   for (i=0; i<ncols; i++) {
211     PetscInt j = icol[i] / bs;
212     if ((vary[j]--)==bs) iary[count++] = j;
213   }
214   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
215   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
216   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
217   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
218 
219   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
220   ierr = ISDestroy(&is1);CHKERRQ(ierr);
221   ierr = ISDestroy(&is2);CHKERRQ(ierr);
222   PetscFunctionReturn(0);
223 }
224 
225 #undef __FUNCT__
226 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
227 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
228 {
229   PetscErrorCode ierr;
230   PetscInt       i;
231 
232   PetscFunctionBegin;
233   if (scall == MAT_INITIAL_MATRIX) {
234     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
235   }
236 
237   for (i=0; i<n; i++) {
238     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
239   }
240   PetscFunctionReturn(0);
241 }
242 
243 
244 /* -------------------------------------------------------*/
245 /* Should check that shapes of vectors and matrices match */
246 /* -------------------------------------------------------*/
247 
248 #if defined(PETSC_THREADCOMM_ACTIVE)
249 PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
250 {
251   PetscErrorCode    ierr;
252   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
253   PetscScalar       *z;
254   const PetscScalar *x;
255   const MatScalar   *aa;
256   PetscInt          *trstarts=A->rmap->trstarts;
257   PetscInt          n,start,end,i;
258   const PetscInt    *aj,*ai;
259   PetscScalar       sum;
260 
261   ierr  = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
262   ierr  = VecGetArray(zz,&z);CHKERRQ(ierr);
263   start = trstarts[thread_id];
264   end   = trstarts[thread_id+1];
265   ai    = a->i;
266   for (i=start; i<end; i++) {
267     n   = ai[i+1] - ai[i];
268     aj  = a->j + ai[i];
269     aa  = a->a + ai[i];
270     sum = 0.0;
271     PetscSparseDensePlusDot(sum,x,aa,aj,n);
272     z[i] = sum;
273   }
274   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
275   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
276   return 0;
277 }
278 
279 #undef __FUNCT__
280 #define __FUNCT__ "MatMult_SeqBAIJ_1"
281 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
282 {
283   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
284   PetscScalar       *z,sum;
285   const PetscScalar *x;
286   const MatScalar   *v;
287   PetscErrorCode    ierr;
288   PetscInt          mbs,i,n;
289   const PetscInt    *idx,*ii,*ridx=NULL;
290   PetscBool         usecprow=a->compressedrow.use;
291 
292   PetscFunctionBegin;
293 
294   if (usecprow) {
295     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
296     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
297     mbs  = a->compressedrow.nrows;
298     ii   = a->compressedrow.i;
299     ridx = a->compressedrow.rindex;
300     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
301     for (i=0; i<mbs; i++) {
302       n   = ii[i+1] - ii[i];
303       v   = a->a + ii[i];
304       idx = a->j + ii[i];
305       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
306       PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
307       sum = 0.0;
308       PetscSparseDensePlusDot(sum,x,v,idx,n);
309       z[ridx[i]] = sum;
310     }
311     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
312     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
313   } else {
314     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);CHKERRQ(ierr);
315   }
316   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
317   PetscFunctionReturn(0);
318 }
319 #else
320 #undef __FUNCT__
321 #define __FUNCT__ "MatMult_SeqBAIJ_1"
322 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
323 {
324   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
325   PetscScalar       *z,sum;
326   const PetscScalar *x;
327   const MatScalar   *v;
328   PetscErrorCode    ierr;
329   PetscInt          mbs,i,n;
330   const PetscInt    *idx,*ii,*ridx=NULL;
331   PetscBool         usecprow=a->compressedrow.use;
332 
333   PetscFunctionBegin;
334   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
335   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
336 
337   if (usecprow) {
338     mbs  = a->compressedrow.nrows;
339     ii   = a->compressedrow.i;
340     ridx = a->compressedrow.rindex;
341     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
342   } else {
343     mbs = a->mbs;
344     ii  = a->i;
345   }
346 
347   for (i=0; i<mbs; i++) {
348     n   = ii[1] - ii[0];
349     v   = a->a + ii[0];
350     idx = a->j + ii[0];
351     ii++;
352     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
353     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
354     sum = 0.0;
355     PetscSparseDensePlusDot(sum,x,v,idx,n);
356     if (usecprow) {
357       z[ridx[i]] = sum;
358     } else {
359       z[i]        = sum;
360     }
361   }
362   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
363   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
364   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
365   PetscFunctionReturn(0);
366 }
367 #endif
368 
369 #if defined(PETSC_THREADCOMM_ACTIVE)
370 PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
371 {
372   PetscErrorCode    ierr;
373   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
374   PetscScalar       *z,x1,x2,sum1,sum2;
375   const PetscScalar *x,*xb;
376   const MatScalar   *aa;
377   PetscInt          *trstarts=A->rmap->trstarts;
378   PetscInt          n,start,end,i,j;
379   const PetscInt    *aj,*ai;
380 
381   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
382   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
383   start  = trstarts[thread_id] / 2;
384   end    = trstarts[thread_id+1] / 2;
385   ai     = a->i;
386   for (i=start; i<end; i++) {
387     n    = ai[i+1] - ai[i];
388     aj   = a->j + ai[i];
389     aa   = a->a + ai[i]*4;
390     sum1 = 0.0; sum2 = 0.0;
391     for (j=0; j<n; j++) {
392       xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1];
393       sum1 += aa[4*j]*x1   + aa[4*j+2]*x2;
394       sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2;
395     }
396     z[2*i] = sum1; z[2*i+1] = sum2;
397   }
398   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
399   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
400   return 0;
401 }
402 
403 #undef __FUNCT__
404 #define __FUNCT__ "MatMult_SeqBAIJ_2"
405 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
406 {
407   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
408   PetscScalar       *z,x1,x2,sum1,sum2;
409   const PetscScalar *x,*xb;
410   const MatScalar   *v;
411   PetscErrorCode    ierr;
412   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
413   PetscBool         usecprow=a->compressedrow.use;
414 
415   PetscFunctionBegin;
416 
417   if (usecprow) {
418     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
419     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
420     mbs  = a->compressedrow.nrows;
421     ii   = a->compressedrow.i;
422     ridx = a->compressedrow.rindex;
423     for (i=0; i<mbs; i++) {
424       n    = ii[i+1] - ii[i];
425       idx  = a->j + ii[i];
426       v    = a->a + ii[i]*4;
427       sum1 = 0.0; sum2 = 0.0;
428       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
429       PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
430       for (j=0; j<n; j++) {
431 	xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1];
432 	sum1 += v[4*j]*x1   + v[4*j+2]*x2;
433 	sum2 += v[4*j+1]*x1 + v[4*j+3]*x2;
434       }
435       z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2;
436     }
437     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
438     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
439   } else {
440     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);CHKERRQ(ierr);
441   }
442   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
443   PetscFunctionReturn(0);
444 }
445 #else
446 #undef __FUNCT__
447 #define __FUNCT__ "MatMult_SeqBAIJ_2"
448 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
449 {
450   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
451   PetscScalar       *z = 0,sum1,sum2,*zarray;
452   const PetscScalar *x,*xb;
453   PetscScalar       x1,x2;
454   const MatScalar   *v;
455   PetscErrorCode    ierr;
456   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
457   PetscBool         usecprow=a->compressedrow.use;
458 
459   PetscFunctionBegin;
460   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
461   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
462 
463   idx = a->j;
464   v   = a->a;
465   if (usecprow) {
466     mbs  = a->compressedrow.nrows;
467     ii   = a->compressedrow.i;
468     ridx = a->compressedrow.rindex;
469   } else {
470     mbs = a->mbs;
471     ii  = a->i;
472     z   = zarray;
473   }
474 
475   for (i=0; i<mbs; i++) {
476     n           = ii[1] - ii[0]; ii++;
477     sum1        = 0.0; sum2 = 0.0;
478     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
479     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
480     for (j=0; j<n; j++) {
481       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
482       sum1 += v[0]*x1 + v[2]*x2;
483       sum2 += v[1]*x1 + v[3]*x2;
484       v    += 4;
485     }
486     if (usecprow) z = zarray + 2*ridx[i];
487     z[0] = sum1; z[1] = sum2;
488     if (!usecprow) z += 2;
489   }
490   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
491   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
492   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
493   PetscFunctionReturn(0);
494 }
495 #endif
496 
497 #if defined(PETSC_THREADCOMM_ACTIVE)
498 PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
499 {
500   PetscErrorCode    ierr;
501   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
502   PetscScalar       *z,x1,x2,x3,sum1,sum2,sum3;
503   const PetscScalar *x,*xb;
504   const MatScalar   *aa;
505   PetscInt          *trstarts=A->rmap->trstarts;
506   PetscInt          n,start,end,i,j;
507   const PetscInt    *aj,*ai;
508 
509   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
510   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
511   start  = trstarts[thread_id] / 3;
512   end    = trstarts[thread_id+1] / 3;
513   ai     = a->i;
514   for (i=start; i<end; i++) {
515     n    = ai[i+1] - ai[i];
516     aj   = a->j + ai[i];
517     aa   = a->a + ai[i]*9;
518     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
519     for (j=0; j<n; j++) {
520       xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
521       sum1 += aa[9*j]*x1   + aa[9*j+3]*x2 + aa[9*j+6]*x3;
522       sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3;
523       sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3;
524     }
525     z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3;
526   }
527   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
528   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
529   return 0;
530 }
531 
532 #undef __FUNCT__
533 #define __FUNCT__ "MatMult_SeqBAIJ_3"
534 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
535 {
536   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
537   PetscScalar       *z,sum1,sum2,sum3,x1,x2,x3;
538   const PetscScalar *x,*xb;
539   const MatScalar   *v;
540   PetscErrorCode    ierr;
541   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
542   PetscBool         usecprow=a->compressedrow.use;
543 
544 
545 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
546 #pragma disjoint(*v,*z,*xb)
547 #endif
548 
549   PetscFunctionBegin;
550 
551   if (usecprow) {
552     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
553     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
554     mbs  = a->compressedrow.nrows;
555     ii   = a->compressedrow.i;
556     ridx = a->compressedrow.rindex;
557     for (i=0; i<mbs; i++) {
558       n    = ii[i+1] - ii[i];
559       idx  = a->j + ii[i];
560       v    = a->a + ii[i]*9;
561       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
562       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
563       PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
564       for (j=0; j<n; j++) {
565 	xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
566 	sum1 += v[9*j]*x1   + v[9*j+3]*x2 + v[9*j+6]*x3;
567 	sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3;
568 	sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3;
569       }
570       z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3;
571     }
572     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
573     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
574   } else {
575     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);CHKERRQ(ierr);
576   }
577   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
578   PetscFunctionReturn(0);
579 }
580 #else
581 #undef __FUNCT__
582 #define __FUNCT__ "MatMult_SeqBAIJ_3"
583 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
584 {
585   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
586   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
587   const PetscScalar *x,*xb;
588   const MatScalar   *v;
589   PetscErrorCode    ierr;
590   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
591   PetscBool         usecprow=a->compressedrow.use;
592 
593 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
594 #pragma disjoint(*v,*z,*xb)
595 #endif
596 
597   PetscFunctionBegin;
598   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
599   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
600 
601   idx = a->j;
602   v   = a->a;
603   if (usecprow) {
604     mbs  = a->compressedrow.nrows;
605     ii   = a->compressedrow.i;
606     ridx = a->compressedrow.rindex;
607   } else {
608     mbs = a->mbs;
609     ii  = a->i;
610     z   = zarray;
611   }
612 
613   for (i=0; i<mbs; i++) {
614     n           = ii[1] - ii[0]; ii++;
615     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
616     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
617     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
618     for (j=0; j<n; j++) {
619       xb = x + 3*(*idx++);
620       x1 = xb[0];
621       x2 = xb[1];
622       x3 = xb[2];
623 
624       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
625       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
626       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
627       v    += 9;
628     }
629     if (usecprow) z = zarray + 3*ridx[i];
630     z[0] = sum1; z[1] = sum2; z[2] = sum3;
631     if (!usecprow) z += 3;
632   }
633   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
634   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
635   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
636   PetscFunctionReturn(0);
637 }
638 #endif
639 
640 #if defined(PETSC_THREADCOMM_ACTIVE)
641 PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
642 {
643   PetscErrorCode    ierr;
644   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
645   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
646   const PetscScalar *x,*xb;
647   const MatScalar   *aa;
648   PetscInt          *trstarts=A->rmap->trstarts;
649   PetscInt          n,start,end,i,j;
650   const PetscInt    *aj,*ai;
651 
652   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
653   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
654   start  = trstarts[thread_id] / 4;
655   end    = trstarts[thread_id+1] / 4;
656   ai     = a->i;
657   for (i=start; i<end; i++) {
658     n    = ai[i+1] - ai[i];
659     aj   = a->j + ai[i];
660     aa   = a->a + ai[i]*16;
661     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
662     for (j=0; j<n; j++) {
663       xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
664       sum1 += aa[16*j]*x1   + aa[16*j+4]*x2 + aa[16*j+8]*x3  + aa[16*j+12]*x4;
665       sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3  + aa[16*j+13]*x4;
666       sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4;
667       sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4;
668     }
669     z[4*i]   = sum1; z[4*i+1] = sum2;
670     z[4*i+2] = sum3; z[4*i+3] = sum4;
671   }
672   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
673   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
674   return 0;
675 }
676 
677 #undef __FUNCT__
678 #define __FUNCT__ "MatMult_SeqBAIJ_4"
679 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
680 {
681   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
682   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
683   const PetscScalar *x,*xb;
684   const MatScalar   *v;
685   PetscErrorCode    ierr;
686   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
687   PetscBool         usecprow=a->compressedrow.use;
688 
689   PetscFunctionBegin;
690 
691   if (usecprow) {
692     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
693     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
694     mbs  = a->compressedrow.nrows;
695     ii   = a->compressedrow.i;
696     ridx = a->compressedrow.rindex;
697     for (i=0; i<mbs; i++) {
698       n = ii[i+1] - ii[1];
699       idx  = a->j + ii[i];
700       v    = a->a + ii[i]*16;
701       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
702       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
703       PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
704       for (j=0; j<n; j++) {
705 	xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
706 	sum1 += v[16*j]*x1   + v[16*j+4]*x2 + v[16*j+8]*x3  + v[16*j+12]*x4;
707 	sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3  + v[16*j+13]*x4;
708 	sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4;
709 	sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4;
710       }
711       z[4*ridx[i]]   = sum1; z[4*ridx[i]+1] = sum2;
712       z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4;
713     }
714     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
715     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
716   } else {
717     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);CHKERRQ(ierr);
718   }
719   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
720   PetscFunctionReturn(0);
721 }
722 #else
723 #undef __FUNCT__
724 #define __FUNCT__ "MatMult_SeqBAIJ_4"
725 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
726 {
727   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
728   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
729   const PetscScalar *x,*xb;
730   const MatScalar   *v;
731   PetscErrorCode    ierr;
732   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
733   PetscBool         usecprow=a->compressedrow.use;
734 
735   PetscFunctionBegin;
736   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
737   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
738 
739   idx = a->j;
740   v   = a->a;
741   if (usecprow) {
742     mbs  = a->compressedrow.nrows;
743     ii   = a->compressedrow.i;
744     ridx = a->compressedrow.rindex;
745   } else {
746     mbs = a->mbs;
747     ii  = a->i;
748     z   = zarray;
749   }
750 
751   for (i=0; i<mbs; i++) {
752     n = ii[1] - ii[0];
753     ii++;
754     sum1 = 0.0;
755     sum2 = 0.0;
756     sum3 = 0.0;
757     sum4 = 0.0;
758 
759     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
760     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
761     for (j=0; j<n; j++) {
762       xb    = x + 4*(*idx++);
763       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
764       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
765       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
766       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
767       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
768       v    += 16;
769     }
770     if (usecprow) z = zarray + 4*ridx[i];
771     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
772     if (!usecprow) z += 4;
773   }
774   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
775   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
776   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
777   PetscFunctionReturn(0);
778 }
779 #endif
780 
781 #if defined(PETSC_THREADCOMM_ACTIVE)
782 PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
783 {
784   PetscErrorCode    ierr;
785   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
786   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
787   const PetscScalar *x,*xb;
788   const MatScalar   *aa;
789   PetscInt          *trstarts=A->rmap->trstarts;
790   PetscInt          n,start,end,i,j;
791   const PetscInt    *aj,*ai;
792 
793   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
794   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
795   start  = trstarts[thread_id] / 5;
796   end    = trstarts[thread_id+1] / 5;
797   ai     = a->i;
798   for (i=start; i<end; i++) {
799     n    = ai[i+1] - ai[i];
800     aj   = a->j + ai[i];
801     aa   = a->a + ai[i]*25;
802     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
803     for (j=0; j<n; j++) {
804       xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
805       sum1 += aa[25*j]*x1   + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5;
806       sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5;
807       sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5;
808       sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5;
809       sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5;
810     }
811     z[5*i]   = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3;
812     z[5*i+3] = sum4; z[5*i+4] = sum5;
813   }
814   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
815   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
816   return 0;
817 }
818 
819 #undef __FUNCT__
820 #define __FUNCT__ "MatMult_SeqBAIJ_5"
821 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
822 {
823   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
824   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
825   const PetscScalar *xb,*x;
826   const MatScalar   *v;
827   PetscErrorCode    ierr;
828   const PetscInt    *idx,*ii,*ridx=NULL;
829   PetscInt          mbs,i,j,n;
830   PetscBool         usecprow=a->compressedrow.use;
831 
832   PetscFunctionBegin;
833 
834   if (usecprow) {
835     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
836     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
837     mbs  = a->compressedrow.nrows;
838     ii   = a->compressedrow.i;
839     ridx = a->compressedrow.rindex;
840     for (i=0; i<mbs; i++) {
841       n    = ii[i+1] - ii[i];
842       idx  = a->j + ii[i];
843       v    = a->a + ii[i]*25;
844       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
845       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
846       PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
847       for (j=0; j<n; j++) {
848 	xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
849 	sum1 += v[25*j]*x1   + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5;
850 	sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5;
851 	sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5;
852 	sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5;
853 	sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5;
854       }
855       z[5*ridx[i]]   = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3;
856       z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5;
857     }
858     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
859     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
860   } else {
861     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);CHKERRQ(ierr);
862   }
863   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
864   PetscFunctionReturn(0);
865 }
866 #else
867 #undef __FUNCT__
868 #define __FUNCT__ "MatMult_SeqBAIJ_5"
869 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
870 {
871   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
872   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
873   const PetscScalar *xb,*x;
874   const MatScalar   *v;
875   PetscErrorCode    ierr;
876   const PetscInt    *idx,*ii,*ridx=NULL;
877   PetscInt          mbs,i,j,n;
878   PetscBool         usecprow=a->compressedrow.use;
879 
880   PetscFunctionBegin;
881   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
882   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
883 
884   idx = a->j;
885   v   = a->a;
886   if (usecprow) {
887     mbs  = a->compressedrow.nrows;
888     ii   = a->compressedrow.i;
889     ridx = a->compressedrow.rindex;
890   } else {
891     mbs = a->mbs;
892     ii  = a->i;
893     z   = zarray;
894   }
895 
896   for (i=0; i<mbs; i++) {
897     n           = ii[1] - ii[0]; ii++;
898     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
899     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
900     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
901     for (j=0; j<n; j++) {
902       xb    = x + 5*(*idx++);
903       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
904       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
905       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
906       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
907       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
908       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
909       v    += 25;
910     }
911     if (usecprow) z = zarray + 5*ridx[i];
912     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
913     if (!usecprow) z += 5;
914   }
915   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
916   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
917   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
918   PetscFunctionReturn(0);
919 }
920 #endif
921 
922 
923 #if defined(PETSC_THREADCOMM_ACTIVE)
924 PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
925 {
926   PetscErrorCode    ierr;
927   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
928   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
929   const PetscScalar *x,*xb;
930   const MatScalar   *aa;
931   PetscInt          *trstarts=A->rmap->trstarts;
932   PetscInt          n,start,end,i,j;
933   const PetscInt    *aj,*ai;
934 
935   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
936   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
937   start  = trstarts[thread_id] / 6;
938   end    = trstarts[thread_id+1] / 6;
939   ai     = a->i;
940   for (i=start; i<end; i++) {
941     n    = ai[i+1] - ai[i];
942     aj   = a->j + ai[i];
943     aa   = a->a + ai[i]*36;
944     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
945     for (j=0; j<n; j++) {
946       xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
947       sum1 += aa[36*j]*x1   + aa[36*j+6]*x2  + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6;
948       sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2  + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6;
949       sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2  + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6;
950       sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2  + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6;
951       sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6;
952       sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6;
953     }
954     z[6*i]   = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3;
955     z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6;
956   }
957   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
958   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
959   return 0;
960 }
961 
962 #undef __FUNCT__
963 #define __FUNCT__ "MatMult_SeqBAIJ_6"
964 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
965 {
966   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
967   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
968   const PetscScalar *x,*xb;
969   const MatScalar   *v;
970   PetscErrorCode    ierr;
971   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
972   PetscBool         usecprow=a->compressedrow.use;
973 
974   PetscFunctionBegin;
975 
976   if (usecprow) {
977     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
978     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
979     mbs  = a->compressedrow.nrows;
980     ii   = a->compressedrow.i;
981     ridx = a->compressedrow.rindex;
982     for (i=0; i<mbs; i++) {
983       n  = ii[i+1] - ii[i];
984       idx  = a->j + ii[i];
985       v    = a->a + ii[i]*36;
986       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
987       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
988       PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
989       for (j=0; j<n; j++) {
990 	xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
991 	sum1 += v[36*j]*x1   + v[36*j+6]*x2  + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6;
992 	sum2 += v[36*j+1]*x1 + v[36*j+7]*x2  + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6;
993 	sum3 += v[36*j+2]*x1 + v[36*j+8]*x2  + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6;
994 	sum4 += v[36*j+3]*x1 + v[36*j+9]*x2  + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6;
995 	sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6;
996 	sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6;
997       }
998       z[6*ridx[i]]   = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3;
999       z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6;
1000     }
1001     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1002     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1003   } else {
1004     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);CHKERRQ(ierr);
1005   }
1006   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
1007   PetscFunctionReturn(0);
1008 }
1009 #else
1010 #undef __FUNCT__
1011 #define __FUNCT__ "MatMult_SeqBAIJ_6"
1012 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
1013 {
1014   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1015   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1016   const PetscScalar *x,*xb;
1017   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
1018   const MatScalar   *v;
1019   PetscErrorCode    ierr;
1020   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1021   PetscBool         usecprow=a->compressedrow.use;
1022 
1023   PetscFunctionBegin;
1024   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1025   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1026 
1027   idx = a->j;
1028   v   = a->a;
1029   if (usecprow) {
1030     mbs  = a->compressedrow.nrows;
1031     ii   = a->compressedrow.i;
1032     ridx = a->compressedrow.rindex;
1033   } else {
1034     mbs = a->mbs;
1035     ii  = a->i;
1036     z   = zarray;
1037   }
1038 
1039   for (i=0; i<mbs; i++) {
1040     n  = ii[1] - ii[0];
1041     ii++;
1042     sum1 = 0.0;
1043     sum2 = 0.0;
1044     sum3 = 0.0;
1045     sum4 = 0.0;
1046     sum5 = 0.0;
1047     sum6 = 0.0;
1048 
1049     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1050     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1051     for (j=0; j<n; j++) {
1052       xb    = x + 6*(*idx++);
1053       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1054       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1055       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1056       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1057       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1058       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1059       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1060       v    += 36;
1061     }
1062     if (usecprow) z = zarray + 6*ridx[i];
1063     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1064     if (!usecprow) z += 6;
1065   }
1066 
1067   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1068   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1069   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
1070   PetscFunctionReturn(0);
1071 }
1072 #endif
1073 
1074 #if defined(PETSC_THREADCOMM_ACTIVE)
1075 PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
1076 {
1077   PetscErrorCode    ierr;
1078   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1079   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1080   const PetscScalar *x,*xb;
1081   const MatScalar   *aa;
1082   PetscInt          *trstarts=A->rmap->trstarts;
1083   PetscInt          n,start,end,i,j;
1084   const PetscInt    *aj,*ai;
1085 
1086   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1087   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
1088   start  = trstarts[thread_id] / 7;
1089   end    = trstarts[thread_id+1] / 7;
1090   ai     = a->i;
1091   for (i=start; i<end; i++) {
1092     n    = ai[i+1] - ai[i];
1093     aj   = a->j + ai[i];
1094     aa   = a->a + ai[i]*49;
1095     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1096     for (j=0; j<n; j++) {
1097       xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1098       sum1 += aa[49*j]*x1   + aa[49*j+7]*x2  + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7;
1099       sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2  + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7;
1100       sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2  + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7;
1101       sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7;
1102       sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7;
1103       sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7;
1104       sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7;
1105     }
1106     z[7*i]   = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4;
1107     z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7;
1108   }
1109   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1110   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1111   return 0;
1112 }
1113 
1114 #undef __FUNCT__
1115 #define __FUNCT__ "MatMult_SeqBAIJ_7"
1116 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1117 {
1118   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1119   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1120   const PetscScalar *x,*xb;
1121   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1122   const MatScalar   *v;
1123   PetscErrorCode    ierr;
1124   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1125   PetscBool         usecprow=a->compressedrow.use;
1126 
1127   PetscFunctionBegin;
1128 
1129   if (usecprow) {
1130     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1131     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1132     mbs  = a->compressedrow.nrows;
1133     ii   = a->compressedrow.i;
1134     ridx = a->compressedrow.rindex;
1135     for (i=0; i<mbs; i++) {
1136       n    = ii[i+1] - ii[i];
1137       idx  = a->j + ii[i];
1138       v    = a->a + ii[i]*49;
1139       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1140       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1141       PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1142       for (j=0; j<n; j++) {
1143 	xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1144 	sum1 += v[49*j]*x1   + v[49*j+7]*x2  + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7;
1145 	sum2 += v[49*j+1]*x1 + v[49*j+8]*x2  + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7;
1146 	sum3 += v[49*j+2]*x1 + v[49*j+9]*x2  + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7;
1147 	sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7;
1148 	sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7;
1149 	sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7;
1150 	sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7;
1151       }
1152       z[7*ridx[i]]   = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4;
1153       z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7;
1154     }
1155     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1156     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1157   } else {
1158     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);CHKERRQ(ierr);
1159   }
1160   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
1161   PetscFunctionReturn(0);
1162 }
1163 #else
1164 #undef __FUNCT__
1165 #define __FUNCT__ "MatMult_SeqBAIJ_7"
1166 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1167 {
1168   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1169   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1170   const PetscScalar *x,*xb;
1171   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1172   const MatScalar   *v;
1173   PetscErrorCode    ierr;
1174   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1175   PetscBool         usecprow=a->compressedrow.use;
1176 
1177   PetscFunctionBegin;
1178   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1179   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1180 
1181   idx = a->j;
1182   v   = a->a;
1183   if (usecprow) {
1184     mbs  = a->compressedrow.nrows;
1185     ii   = a->compressedrow.i;
1186     ridx = a->compressedrow.rindex;
1187   } else {
1188     mbs = a->mbs;
1189     ii  = a->i;
1190     z   = zarray;
1191   }
1192 
1193   for (i=0; i<mbs; i++) {
1194     n  = ii[1] - ii[0];
1195     ii++;
1196     sum1 = 0.0;
1197     sum2 = 0.0;
1198     sum3 = 0.0;
1199     sum4 = 0.0;
1200     sum5 = 0.0;
1201     sum6 = 0.0;
1202     sum7 = 0.0;
1203 
1204     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1205     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1206     for (j=0; j<n; j++) {
1207       xb    = x + 7*(*idx++);
1208       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1209       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1210       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1211       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1212       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1213       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1214       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1215       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1216       v    += 49;
1217     }
1218     if (usecprow) z = zarray + 7*ridx[i];
1219     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1220     if (!usecprow) z += 7;
1221   }
1222 
1223   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1224   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1225   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
1226   PetscFunctionReturn(0);
1227 }
1228 #endif
1229 
1230 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1231 /* Default MatMult for block size 15 */
1232 
1233 #undef __FUNCT__
1234 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
1235 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1236 {
1237   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1238   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1239   const PetscScalar *x,*xb;
1240   PetscScalar       *zarray,xv;
1241   const MatScalar   *v;
1242   PetscErrorCode    ierr;
1243   const PetscInt    *ii,*ij=a->j,*idx;
1244   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1245   PetscBool         usecprow=a->compressedrow.use;
1246 
1247   PetscFunctionBegin;
1248   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1249   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1250 
1251   v = a->a;
1252   if (usecprow) {
1253     mbs  = a->compressedrow.nrows;
1254     ii   = a->compressedrow.i;
1255     ridx = a->compressedrow.rindex;
1256   } else {
1257     mbs = a->mbs;
1258     ii  = a->i;
1259     z   = zarray;
1260   }
1261 
1262   for (i=0; i<mbs; i++) {
1263     n    = ii[i+1] - ii[i];
1264     idx  = ij + ii[i];
1265     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1266     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1267 
1268     for (j=0; j<n; j++) {
1269       xb = x + 15*(idx[j]);
1270 
1271       for (k=0; k<15; k++) {
1272         xv     =  xb[k];
1273         sum1  += v[0]*xv;
1274         sum2  += v[1]*xv;
1275         sum3  += v[2]*xv;
1276         sum4  += v[3]*xv;
1277         sum5  += v[4]*xv;
1278         sum6  += v[5]*xv;
1279         sum7  += v[6]*xv;
1280         sum8  += v[7]*xv;
1281         sum9  += v[8]*xv;
1282         sum10 += v[9]*xv;
1283         sum11 += v[10]*xv;
1284         sum12 += v[11]*xv;
1285         sum13 += v[12]*xv;
1286         sum14 += v[13]*xv;
1287         sum15 += v[14]*xv;
1288         v     += 15;
1289       }
1290     }
1291     if (usecprow) z = zarray + 15*ridx[i];
1292     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1293     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1294 
1295     if (!usecprow) z += 15;
1296   }
1297 
1298   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1299   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1300   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1301   PetscFunctionReturn(0);
1302 }
1303 
1304 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1305 #undef __FUNCT__
1306 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
1307 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1308 {
1309   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1310   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1311   const PetscScalar *x,*xb;
1312   PetscScalar       x1,x2,x3,x4,*zarray;
1313   const MatScalar   *v;
1314   PetscErrorCode    ierr;
1315   const PetscInt    *ii,*ij=a->j,*idx;
1316   PetscInt          mbs,i,j,n,*ridx=NULL;
1317   PetscBool         usecprow=a->compressedrow.use;
1318 
1319   PetscFunctionBegin;
1320   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1321   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1322 
1323   v = a->a;
1324   if (usecprow) {
1325     mbs  = a->compressedrow.nrows;
1326     ii   = a->compressedrow.i;
1327     ridx = a->compressedrow.rindex;
1328   } else {
1329     mbs = a->mbs;
1330     ii  = a->i;
1331     z   = zarray;
1332   }
1333 
1334   for (i=0; i<mbs; i++) {
1335     n    = ii[i+1] - ii[i];
1336     idx  = ij + ii[i];
1337     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1338     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1339 
1340     for (j=0; j<n; j++) {
1341       xb = x + 15*(idx[j]);
1342       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1343 
1344       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1345       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1346       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1347       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1348       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1349       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1350       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1351       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1352       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1353       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1354       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1355       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1356       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1357       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1358       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1359 
1360       v += 60;
1361 
1362       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1363 
1364       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1365       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1366       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1367       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1368       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1369       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1370       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1371       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1372       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1373       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1374       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1375       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1376       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1377       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1378       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1379       v     += 60;
1380 
1381       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1382       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1383       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1384       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1385       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1386       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1387       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1388       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1389       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1390       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1391       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1392       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1393       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1394       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1395       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1396       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1397       v     += 60;
1398 
1399       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1400       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1401       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1402       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1403       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1404       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1405       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1406       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1407       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1408       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1409       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1410       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1411       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1412       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1413       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1414       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1415       v     += 45;
1416     }
1417     if (usecprow) z = zarray + 15*ridx[i];
1418     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1419     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1420 
1421     if (!usecprow) z += 15;
1422   }
1423 
1424   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1425   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1426   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1427   PetscFunctionReturn(0);
1428 }
1429 
1430 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1431 #undef __FUNCT__
1432 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
1433 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1434 {
1435   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1436   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1437   const PetscScalar *x,*xb;
1438   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1439   const MatScalar   *v;
1440   PetscErrorCode    ierr;
1441   const PetscInt    *ii,*ij=a->j,*idx;
1442   PetscInt          mbs,i,j,n,*ridx=NULL;
1443   PetscBool         usecprow=a->compressedrow.use;
1444 
1445   PetscFunctionBegin;
1446   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1447   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1448 
1449   v = a->a;
1450   if (usecprow) {
1451     mbs  = a->compressedrow.nrows;
1452     ii   = a->compressedrow.i;
1453     ridx = a->compressedrow.rindex;
1454   } else {
1455     mbs = a->mbs;
1456     ii  = a->i;
1457     z   = zarray;
1458   }
1459 
1460   for (i=0; i<mbs; i++) {
1461     n    = ii[i+1] - ii[i];
1462     idx  = ij + ii[i];
1463     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1464     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1465 
1466     for (j=0; j<n; j++) {
1467       xb = x + 15*(idx[j]);
1468       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1469       x8 = xb[7];
1470 
1471       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1472       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1473       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1474       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1475       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1476       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1477       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1478       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1479       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1480       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1481       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1482       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1483       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1484       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1485       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1486       v     += 120;
1487 
1488       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1489 
1490       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1491       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1492       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1493       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1494       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1495       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1496       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1497       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1498       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1499       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1500       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1501       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1502       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1503       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1504       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1505       v     += 105;
1506     }
1507     if (usecprow) z = zarray + 15*ridx[i];
1508     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1509     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1510 
1511     if (!usecprow) z += 15;
1512   }
1513 
1514   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1515   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1516   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1517   PetscFunctionReturn(0);
1518 }
1519 
1520 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1521 
1522 #undef __FUNCT__
1523 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
1524 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1525 {
1526   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1527   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1528   const PetscScalar *x,*xb;
1529   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1530   const MatScalar   *v;
1531   PetscErrorCode    ierr;
1532   const PetscInt    *ii,*ij=a->j,*idx;
1533   PetscInt          mbs,i,j,n,*ridx=NULL;
1534   PetscBool         usecprow=a->compressedrow.use;
1535 
1536   PetscFunctionBegin;
1537   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1538   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1539 
1540   v = a->a;
1541   if (usecprow) {
1542     mbs  = a->compressedrow.nrows;
1543     ii   = a->compressedrow.i;
1544     ridx = a->compressedrow.rindex;
1545   } else {
1546     mbs = a->mbs;
1547     ii  = a->i;
1548     z   = zarray;
1549   }
1550 
1551   for (i=0; i<mbs; i++) {
1552     n    = ii[i+1] - ii[i];
1553     idx  = ij + ii[i];
1554     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1555     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1556 
1557     for (j=0; j<n; j++) {
1558       xb = x + 15*(idx[j]);
1559       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1560       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1561 
1562       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1563       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1564       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1565       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1566       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1567       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1568       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1569       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1570       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1571       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1572       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1573       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1574       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1575       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1576       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1577       v     += 225;
1578     }
1579     if (usecprow) z = zarray + 15*ridx[i];
1580     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1581     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1582 
1583     if (!usecprow) z += 15;
1584   }
1585 
1586   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1587   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1588   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1589   PetscFunctionReturn(0);
1590 }
1591 
1592 
1593 /*
1594     This will not work with MatScalar == float because it calls the BLAS
1595 */
1596 #undef __FUNCT__
1597 #define __FUNCT__ "MatMult_SeqBAIJ_N"
1598 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1599 {
1600   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1601   PetscScalar       *z = 0,*work,*workt,*zarray;
1602   const PetscScalar *x,*xb;
1603   const MatScalar   *v;
1604   PetscErrorCode    ierr;
1605   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1606   const PetscInt    *idx,*ii,*ridx=NULL;
1607   PetscInt          ncols,k;
1608   PetscBool         usecprow=a->compressedrow.use;
1609 
1610   PetscFunctionBegin;
1611   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1612   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1613 
1614   idx = a->j;
1615   v   = a->a;
1616   if (usecprow) {
1617     mbs  = a->compressedrow.nrows;
1618     ii   = a->compressedrow.i;
1619     ridx = a->compressedrow.rindex;
1620   } else {
1621     mbs = a->mbs;
1622     ii  = a->i;
1623     z   = zarray;
1624   }
1625 
1626   if (!a->mult_work) {
1627     k    = PetscMax(A->rmap->n,A->cmap->n);
1628     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1629   }
1630   work = a->mult_work;
1631   for (i=0; i<mbs; i++) {
1632     n           = ii[1] - ii[0]; ii++;
1633     ncols       = n*bs;
1634     workt       = work;
1635     for (j=0; j<n; j++) {
1636       xb = x + bs*(*idx++);
1637       for (k=0; k<bs; k++) workt[k] = xb[k];
1638       workt += bs;
1639     }
1640     if (usecprow) z = zarray + bs*ridx[i];
1641     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1642     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1643     v += n*bs2;
1644     if (!usecprow) z += bs;
1645   }
1646   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1647   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1648   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1649   PetscFunctionReturn(0);
1650 }
1651 
1652 #undef __FUNCT__
1653 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1654 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1655 {
1656   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1657   const PetscScalar *x;
1658   PetscScalar       *y,*z,sum;
1659   const MatScalar   *v;
1660   PetscErrorCode    ierr;
1661   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1662   const PetscInt    *idx,*ii;
1663   PetscBool         usecprow=a->compressedrow.use;
1664 
1665   PetscFunctionBegin;
1666   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1667   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1668 
1669   idx = a->j;
1670   v   = a->a;
1671   if (usecprow) {
1672     if (zz != yy) {
1673       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1674     }
1675     mbs  = a->compressedrow.nrows;
1676     ii   = a->compressedrow.i;
1677     ridx = a->compressedrow.rindex;
1678   } else {
1679     ii = a->i;
1680   }
1681 
1682   for (i=0; i<mbs; i++) {
1683     n = ii[1] - ii[0];
1684     ii++;
1685     if (!usecprow) {
1686       sum         = y[i];
1687     } else {
1688       sum = y[ridx[i]];
1689     }
1690     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1691     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1692     PetscSparseDensePlusDot(sum,x,v,idx,n);
1693     v   += n;
1694     idx += n;
1695     if (usecprow) {
1696       z[ridx[i]] = sum;
1697     } else {
1698       z[i] = sum;
1699     }
1700   }
1701   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1702   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1703   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1704   PetscFunctionReturn(0);
1705 }
1706 
1707 #undef __FUNCT__
1708 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1709 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1710 {
1711   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1712   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1713   const PetscScalar *x,*xb;
1714   PetscScalar       x1,x2,*yarray,*zarray;
1715   const MatScalar   *v;
1716   PetscErrorCode    ierr;
1717   PetscInt          mbs = a->mbs,i,n,j;
1718   const PetscInt    *idx,*ii,*ridx = NULL;
1719   PetscBool         usecprow = a->compressedrow.use;
1720 
1721   PetscFunctionBegin;
1722   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1723   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1724 
1725   idx = a->j;
1726   v   = a->a;
1727   if (usecprow) {
1728     if (zz != yy) {
1729       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1730     }
1731     mbs  = a->compressedrow.nrows;
1732     ii   = a->compressedrow.i;
1733     ridx = a->compressedrow.rindex;
1734     if (zz != yy) {
1735       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1736     }
1737   } else {
1738     ii = a->i;
1739     y  = yarray;
1740     z  = zarray;
1741   }
1742 
1743   for (i=0; i<mbs; i++) {
1744     n = ii[1] - ii[0]; ii++;
1745     if (usecprow) {
1746       z = zarray + 2*ridx[i];
1747       y = yarray + 2*ridx[i];
1748     }
1749     sum1 = y[0]; sum2 = y[1];
1750     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1751     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1752     for (j=0; j<n; j++) {
1753       xb = x + 2*(*idx++);
1754       x1 = xb[0];
1755       x2 = xb[1];
1756 
1757       sum1 += v[0]*x1 + v[2]*x2;
1758       sum2 += v[1]*x1 + v[3]*x2;
1759       v    += 4;
1760     }
1761     z[0] = sum1; z[1] = sum2;
1762     if (!usecprow) {
1763       z += 2; y += 2;
1764     }
1765   }
1766   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1767   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1768   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1769   PetscFunctionReturn(0);
1770 }
1771 
1772 #undef __FUNCT__
1773 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1774 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1775 {
1776   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1777   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1778   const PetscScalar *x,*xb;
1779   const MatScalar   *v;
1780   PetscErrorCode    ierr;
1781   PetscInt          mbs = a->mbs,i,j,n;
1782   const PetscInt    *idx,*ii,*ridx = NULL;
1783   PetscBool         usecprow = a->compressedrow.use;
1784 
1785   PetscFunctionBegin;
1786   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1787   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1788 
1789   idx = a->j;
1790   v   = a->a;
1791   if (usecprow) {
1792     if (zz != yy) {
1793       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1794     }
1795     mbs  = a->compressedrow.nrows;
1796     ii   = a->compressedrow.i;
1797     ridx = a->compressedrow.rindex;
1798   } else {
1799     ii = a->i;
1800     y  = yarray;
1801     z  = zarray;
1802   }
1803 
1804   for (i=0; i<mbs; i++) {
1805     n = ii[1] - ii[0]; ii++;
1806     if (usecprow) {
1807       z = zarray + 3*ridx[i];
1808       y = yarray + 3*ridx[i];
1809     }
1810     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1811     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1812     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1813     for (j=0; j<n; j++) {
1814       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1815       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1816       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1817       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1818       v    += 9;
1819     }
1820     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1821     if (!usecprow) {
1822       z += 3; y += 3;
1823     }
1824   }
1825   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1826   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1827   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1828   PetscFunctionReturn(0);
1829 }
1830 
1831 #undef __FUNCT__
1832 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1833 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1834 {
1835   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1836   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1837   const PetscScalar *x,*xb;
1838   const MatScalar   *v;
1839   PetscErrorCode    ierr;
1840   PetscInt          mbs = a->mbs,i,j,n;
1841   const PetscInt    *idx,*ii,*ridx=NULL;
1842   PetscBool         usecprow=a->compressedrow.use;
1843 
1844   PetscFunctionBegin;
1845   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1846   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1847 
1848   idx = a->j;
1849   v   = a->a;
1850   if (usecprow) {
1851     if (zz != yy) {
1852       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1853     }
1854     mbs  = a->compressedrow.nrows;
1855     ii   = a->compressedrow.i;
1856     ridx = a->compressedrow.rindex;
1857   } else {
1858     ii = a->i;
1859     y  = yarray;
1860     z  = zarray;
1861   }
1862 
1863   for (i=0; i<mbs; i++) {
1864     n = ii[1] - ii[0]; ii++;
1865     if (usecprow) {
1866       z = zarray + 4*ridx[i];
1867       y = yarray + 4*ridx[i];
1868     }
1869     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1870     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1871     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1872     for (j=0; j<n; j++) {
1873       xb    = x + 4*(*idx++);
1874       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1875       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1876       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1877       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1878       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1879       v    += 16;
1880     }
1881     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1882     if (!usecprow) {
1883       z += 4; y += 4;
1884     }
1885   }
1886   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1887   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1888   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1889   PetscFunctionReturn(0);
1890 }
1891 
1892 #undef __FUNCT__
1893 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1894 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1895 {
1896   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1897   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1898   const PetscScalar *x,*xb;
1899   PetscScalar       *yarray,*zarray;
1900   const MatScalar   *v;
1901   PetscErrorCode    ierr;
1902   PetscInt          mbs = a->mbs,i,j,n;
1903   const PetscInt    *idx,*ii,*ridx = NULL;
1904   PetscBool         usecprow=a->compressedrow.use;
1905 
1906   PetscFunctionBegin;
1907   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1908   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1909 
1910   idx = a->j;
1911   v   = a->a;
1912   if (usecprow) {
1913     if (zz != yy) {
1914       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1915     }
1916     mbs  = a->compressedrow.nrows;
1917     ii   = a->compressedrow.i;
1918     ridx = a->compressedrow.rindex;
1919   } else {
1920     ii = a->i;
1921     y  = yarray;
1922     z  = zarray;
1923   }
1924 
1925   for (i=0; i<mbs; i++) {
1926     n = ii[1] - ii[0]; ii++;
1927     if (usecprow) {
1928       z = zarray + 5*ridx[i];
1929       y = yarray + 5*ridx[i];
1930     }
1931     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1932     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1933     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1934     for (j=0; j<n; j++) {
1935       xb    = x + 5*(*idx++);
1936       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1937       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1938       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1939       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1940       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1941       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1942       v    += 25;
1943     }
1944     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1945     if (!usecprow) {
1946       z += 5; y += 5;
1947     }
1948   }
1949   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1950   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1951   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1952   PetscFunctionReturn(0);
1953 }
1954 #undef __FUNCT__
1955 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1956 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1957 {
1958   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1959   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1960   const PetscScalar *x,*xb;
1961   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1962   const MatScalar   *v;
1963   PetscErrorCode    ierr;
1964   PetscInt          mbs = a->mbs,i,j,n;
1965   const PetscInt    *idx,*ii,*ridx=NULL;
1966   PetscBool         usecprow=a->compressedrow.use;
1967 
1968   PetscFunctionBegin;
1969   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1970   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1971 
1972   idx = a->j;
1973   v   = a->a;
1974   if (usecprow) {
1975     if (zz != yy) {
1976       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1977     }
1978     mbs  = a->compressedrow.nrows;
1979     ii   = a->compressedrow.i;
1980     ridx = a->compressedrow.rindex;
1981   } else {
1982     ii = a->i;
1983     y  = yarray;
1984     z  = zarray;
1985   }
1986 
1987   for (i=0; i<mbs; i++) {
1988     n = ii[1] - ii[0]; ii++;
1989     if (usecprow) {
1990       z = zarray + 6*ridx[i];
1991       y = yarray + 6*ridx[i];
1992     }
1993     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1994     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1995     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1996     for (j=0; j<n; j++) {
1997       xb    = x + 6*(*idx++);
1998       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1999       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
2000       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2001       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2002       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2003       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2004       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2005       v    += 36;
2006     }
2007     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2008     if (!usecprow) {
2009       z += 6; y += 6;
2010     }
2011   }
2012   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2013   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2014   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
2015   PetscFunctionReturn(0);
2016 }
2017 
2018 #undef __FUNCT__
2019 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
2020 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2021 {
2022   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2023   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2024   const PetscScalar *x,*xb;
2025   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2026   const MatScalar   *v;
2027   PetscErrorCode    ierr;
2028   PetscInt          mbs = a->mbs,i,j,n;
2029   const PetscInt    *idx,*ii,*ridx = NULL;
2030   PetscBool         usecprow=a->compressedrow.use;
2031 
2032   PetscFunctionBegin;
2033   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2034   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2035 
2036   idx = a->j;
2037   v   = a->a;
2038   if (usecprow) {
2039     if (zz != yy) {
2040       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2041     }
2042     mbs  = a->compressedrow.nrows;
2043     ii   = a->compressedrow.i;
2044     ridx = a->compressedrow.rindex;
2045   } else {
2046     ii = a->i;
2047     y  = yarray;
2048     z  = zarray;
2049   }
2050 
2051   for (i=0; i<mbs; i++) {
2052     n = ii[1] - ii[0]; ii++;
2053     if (usecprow) {
2054       z = zarray + 7*ridx[i];
2055       y = yarray + 7*ridx[i];
2056     }
2057     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2058     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2059     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2060     for (j=0; j<n; j++) {
2061       xb    = x + 7*(*idx++);
2062       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2063       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2064       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2065       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2066       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2067       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2068       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2069       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2070       v    += 49;
2071     }
2072     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2073     if (!usecprow) {
2074       z += 7; y += 7;
2075     }
2076   }
2077   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2078   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2079   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
2080   PetscFunctionReturn(0);
2081 }
2082 
2083 #undef __FUNCT__
2084 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
2085 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2086 {
2087   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2088   PetscScalar       *z = 0,*work,*workt,*zarray;
2089   const PetscScalar *x,*xb;
2090   const MatScalar   *v;
2091   PetscErrorCode    ierr;
2092   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2093   PetscInt          ncols,k;
2094   const PetscInt    *ridx = NULL,*idx,*ii;
2095   PetscBool         usecprow = a->compressedrow.use;
2096 
2097   PetscFunctionBegin;
2098   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
2099   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2100   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2101 
2102   idx = a->j;
2103   v   = a->a;
2104   if (usecprow) {
2105     mbs  = a->compressedrow.nrows;
2106     ii   = a->compressedrow.i;
2107     ridx = a->compressedrow.rindex;
2108   } else {
2109     mbs = a->mbs;
2110     ii  = a->i;
2111     z   = zarray;
2112   }
2113 
2114   if (!a->mult_work) {
2115     k    = PetscMax(A->rmap->n,A->cmap->n);
2116     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2117   }
2118   work = a->mult_work;
2119   for (i=0; i<mbs; i++) {
2120     n     = ii[1] - ii[0]; ii++;
2121     ncols = n*bs;
2122     workt = work;
2123     for (j=0; j<n; j++) {
2124       xb = x + bs*(*idx++);
2125       for (k=0; k<bs; k++) workt[k] = xb[k];
2126       workt += bs;
2127     }
2128     if (usecprow) z = zarray + bs*ridx[i];
2129     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2130     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
2131     v += n*bs2;
2132     if (!usecprow) z += bs;
2133   }
2134   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2135   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2136   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
2137   PetscFunctionReturn(0);
2138 }
2139 
2140 #undef __FUNCT__
2141 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
2142 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2143 {
2144   PetscScalar    zero = 0.0;
2145   PetscErrorCode ierr;
2146 
2147   PetscFunctionBegin;
2148   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2149   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2150   PetscFunctionReturn(0);
2151 }
2152 
2153 #undef __FUNCT__
2154 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
2155 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2156 {
2157   PetscScalar    zero = 0.0;
2158   PetscErrorCode ierr;
2159 
2160   PetscFunctionBegin;
2161   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2162   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2163   PetscFunctionReturn(0);
2164 }
2165 
2166 #undef __FUNCT__
2167 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
2168 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2169 {
2170   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2171   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2172   const PetscScalar *x,*xb = NULL;
2173   const MatScalar   *v;
2174   PetscErrorCode    ierr;
2175   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2176   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2177   Mat_CompressedRow cprow = a->compressedrow;
2178   PetscBool         usecprow = cprow.use;
2179 
2180   PetscFunctionBegin;
2181   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2182   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2183   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2184 
2185   idx = a->j;
2186   v   = a->a;
2187   if (usecprow) {
2188     mbs  = cprow.nrows;
2189     ii   = cprow.i;
2190     ridx = cprow.rindex;
2191   } else {
2192     mbs=a->mbs;
2193     ii = a->i;
2194     xb = x;
2195   }
2196 
2197   switch (bs) {
2198   case 1:
2199     for (i=0; i<mbs; i++) {
2200       if (usecprow) xb = x + ridx[i];
2201       x1 = xb[0];
2202       ib = idx + ii[0];
2203       n  = ii[1] - ii[0]; ii++;
2204       for (j=0; j<n; j++) {
2205         rval     = ib[j];
2206         z[rval] += PetscConj(*v) * x1;
2207         v++;
2208       }
2209       if (!usecprow) xb++;
2210     }
2211     break;
2212   case 2:
2213     for (i=0; i<mbs; i++) {
2214       if (usecprow) xb = x + 2*ridx[i];
2215       x1 = xb[0]; x2 = xb[1];
2216       ib = idx + ii[0];
2217       n  = ii[1] - ii[0]; ii++;
2218       for (j=0; j<n; j++) {
2219         rval       = ib[j]*2;
2220         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2221         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2222         v         += 4;
2223       }
2224       if (!usecprow) xb += 2;
2225     }
2226     break;
2227   case 3:
2228     for (i=0; i<mbs; i++) {
2229       if (usecprow) xb = x + 3*ridx[i];
2230       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2231       ib = idx + ii[0];
2232       n  = ii[1] - ii[0]; ii++;
2233       for (j=0; j<n; j++) {
2234         rval       = ib[j]*3;
2235         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2236         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2237         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2238         v         += 9;
2239       }
2240       if (!usecprow) xb += 3;
2241     }
2242     break;
2243   case 4:
2244     for (i=0; i<mbs; i++) {
2245       if (usecprow) xb = x + 4*ridx[i];
2246       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2247       ib = idx + ii[0];
2248       n  = ii[1] - ii[0]; ii++;
2249       for (j=0; j<n; j++) {
2250         rval       = ib[j]*4;
2251         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2252         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2253         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2254         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2255         v         += 16;
2256       }
2257       if (!usecprow) xb += 4;
2258     }
2259     break;
2260   case 5:
2261     for (i=0; i<mbs; i++) {
2262       if (usecprow) xb = x + 5*ridx[i];
2263       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2264       x4 = xb[3]; x5 = xb[4];
2265       ib = idx + ii[0];
2266       n  = ii[1] - ii[0]; ii++;
2267       for (j=0; j<n; j++) {
2268         rval       = ib[j]*5;
2269         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2270         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2271         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2272         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2273         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2274         v         += 25;
2275       }
2276       if (!usecprow) xb += 5;
2277     }
2278     break;
2279   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
2280     PetscInt          ncols,k;
2281     PetscScalar       *work,*workt;
2282     const PetscScalar *xtmp;
2283 
2284     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2285     if (!a->mult_work) {
2286       k    = PetscMax(A->rmap->n,A->cmap->n);
2287       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2288     }
2289     work = a->mult_work;
2290     xtmp = x;
2291     for (i=0; i<mbs; i++) {
2292       n     = ii[1] - ii[0]; ii++;
2293       ncols = n*bs;
2294       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2295       if (usecprow) xtmp = x + bs*ridx[i];
2296       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2297       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2298       v += n*bs2;
2299       if (!usecprow) xtmp += bs;
2300       workt = work;
2301       for (j=0; j<n; j++) {
2302         zb = z + bs*(*idx++);
2303         for (k=0; k<bs; k++) zb[k] += workt[k] ;
2304         workt += bs;
2305       }
2306     }
2307     }
2308   }
2309   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2310   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2311   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2312   PetscFunctionReturn(0);
2313 }
2314 
2315 #undef __FUNCT__
2316 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
2317 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2318 {
2319   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2320   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2321   const PetscScalar *x,*xb = 0;
2322   const MatScalar   *v;
2323   PetscErrorCode    ierr;
2324   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2325   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2326   Mat_CompressedRow cprow   = a->compressedrow;
2327   PetscBool         usecprow=cprow.use;
2328 
2329   PetscFunctionBegin;
2330   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2331   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2332   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2333 
2334   idx = a->j;
2335   v   = a->a;
2336   if (usecprow) {
2337     mbs  = cprow.nrows;
2338     ii   = cprow.i;
2339     ridx = cprow.rindex;
2340   } else {
2341     mbs=a->mbs;
2342     ii = a->i;
2343     xb = x;
2344   }
2345 
2346   switch (bs) {
2347   case 1:
2348     for (i=0; i<mbs; i++) {
2349       if (usecprow) xb = x + ridx[i];
2350       x1 = xb[0];
2351       ib = idx + ii[0];
2352       n  = ii[1] - ii[0]; ii++;
2353       for (j=0; j<n; j++) {
2354         rval     = ib[j];
2355         z[rval] += *v * x1;
2356         v++;
2357       }
2358       if (!usecprow) xb++;
2359     }
2360     break;
2361   case 2:
2362     for (i=0; i<mbs; i++) {
2363       if (usecprow) xb = x + 2*ridx[i];
2364       x1 = xb[0]; x2 = xb[1];
2365       ib = idx + ii[0];
2366       n  = ii[1] - ii[0]; ii++;
2367       for (j=0; j<n; j++) {
2368         rval       = ib[j]*2;
2369         z[rval++] += v[0]*x1 + v[1]*x2;
2370         z[rval++] += v[2]*x1 + v[3]*x2;
2371         v         += 4;
2372       }
2373       if (!usecprow) xb += 2;
2374     }
2375     break;
2376   case 3:
2377     for (i=0; i<mbs; i++) {
2378       if (usecprow) xb = x + 3*ridx[i];
2379       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2380       ib = idx + ii[0];
2381       n  = ii[1] - ii[0]; ii++;
2382       for (j=0; j<n; j++) {
2383         rval       = ib[j]*3;
2384         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2385         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2386         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2387         v         += 9;
2388       }
2389       if (!usecprow) xb += 3;
2390     }
2391     break;
2392   case 4:
2393     for (i=0; i<mbs; i++) {
2394       if (usecprow) xb = x + 4*ridx[i];
2395       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2396       ib = idx + ii[0];
2397       n  = ii[1] - ii[0]; ii++;
2398       for (j=0; j<n; j++) {
2399         rval       = ib[j]*4;
2400         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2401         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2402         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2403         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2404         v         += 16;
2405       }
2406       if (!usecprow) xb += 4;
2407     }
2408     break;
2409   case 5:
2410     for (i=0; i<mbs; i++) {
2411       if (usecprow) xb = x + 5*ridx[i];
2412       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2413       x4 = xb[3]; x5 = xb[4];
2414       ib = idx + ii[0];
2415       n  = ii[1] - ii[0]; ii++;
2416       for (j=0; j<n; j++) {
2417         rval       = ib[j]*5;
2418         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2419         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2420         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2421         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2422         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2423         v         += 25;
2424       }
2425       if (!usecprow) xb += 5;
2426     }
2427     break;
2428   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2429     PetscInt          ncols,k;
2430     PetscScalar       *work,*workt;
2431     const PetscScalar *xtmp;
2432     if (!a->mult_work) {
2433       k    = PetscMax(A->rmap->n,A->cmap->n);
2434       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2435     }
2436     work = a->mult_work;
2437     xtmp = x;
2438     for (i=0; i<mbs; i++) {
2439       n     = ii[1] - ii[0]; ii++;
2440       ncols = n*bs;
2441       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2442       if (usecprow) xtmp = x + bs*ridx[i];
2443       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2444       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2445       v += n*bs2;
2446       if (!usecprow) xtmp += bs;
2447       workt = work;
2448       for (j=0; j<n; j++) {
2449         zb = z + bs*(*idx++);
2450         for (k=0; k<bs; k++) zb[k] += workt[k];
2451         workt += bs;
2452       }
2453     }
2454     }
2455   }
2456   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2457   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2458   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2459   PetscFunctionReturn(0);
2460 }
2461 
2462 #undef __FUNCT__
2463 #define __FUNCT__ "MatScale_SeqBAIJ"
2464 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2465 {
2466   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2467   PetscInt       totalnz = a->bs2*a->nz;
2468   PetscScalar    oalpha  = alpha;
2469   PetscErrorCode ierr;
2470   PetscBLASInt   one = 1,tnz;
2471 
2472   PetscFunctionBegin;
2473   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2474   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2475   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2476   PetscFunctionReturn(0);
2477 }
2478 
2479 #undef __FUNCT__
2480 #define __FUNCT__ "MatNorm_SeqBAIJ"
2481 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2482 {
2483   PetscErrorCode ierr;
2484   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2485   MatScalar      *v  = a->a;
2486   PetscReal      sum = 0.0;
2487   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2488 
2489   PetscFunctionBegin;
2490   if (type == NORM_FROBENIUS) {
2491     for (i=0; i< bs2*nz; i++) {
2492       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2493     }
2494     *norm = PetscSqrtReal(sum);
2495   } else if (type == NORM_1) { /* maximum column sum */
2496     PetscReal *tmp;
2497     PetscInt  *bcol = a->j;
2498     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2499     for (i=0; i<nz; i++) {
2500       for (j=0; j<bs; j++) {
2501         k1 = bs*(*bcol) + j; /* column index */
2502         for (k=0; k<bs; k++) {
2503           tmp[k1] += PetscAbsScalar(*v); v++;
2504         }
2505       }
2506       bcol++;
2507     }
2508     *norm = 0.0;
2509     for (j=0; j<A->cmap->n; j++) {
2510       if (tmp[j] > *norm) *norm = tmp[j];
2511     }
2512     ierr = PetscFree(tmp);CHKERRQ(ierr);
2513   } else if (type == NORM_INFINITY) { /* maximum row sum */
2514     *norm = 0.0;
2515     for (k=0; k<bs; k++) {
2516       for (j=0; j<a->mbs; j++) {
2517         v   = a->a + bs2*a->i[j] + k;
2518         sum = 0.0;
2519         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2520           for (k1=0; k1<bs; k1++) {
2521             sum += PetscAbsScalar(*v);
2522             v   += bs;
2523           }
2524         }
2525         if (sum > *norm) *norm = sum;
2526       }
2527     }
2528   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2529   PetscFunctionReturn(0);
2530 }
2531 
2532 
2533 #undef __FUNCT__
2534 #define __FUNCT__ "MatEqual_SeqBAIJ"
2535 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2536 {
2537   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2538   PetscErrorCode ierr;
2539 
2540   PetscFunctionBegin;
2541   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2542   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2543     *flg = PETSC_FALSE;
2544     PetscFunctionReturn(0);
2545   }
2546 
2547   /* if the a->i are the same */
2548   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2549   if (!*flg) PetscFunctionReturn(0);
2550 
2551   /* if a->j are the same */
2552   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2553   if (!*flg) PetscFunctionReturn(0);
2554 
2555   /* if a->a are the same */
2556   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2557   PetscFunctionReturn(0);
2558 
2559 }
2560 
2561 #undef __FUNCT__
2562 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
2563 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2564 {
2565   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2566   PetscErrorCode ierr;
2567   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2568   PetscScalar    *x,zero = 0.0;
2569   MatScalar      *aa,*aa_j;
2570 
2571   PetscFunctionBegin;
2572   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2573   bs   = A->rmap->bs;
2574   aa   = a->a;
2575   ai   = a->i;
2576   aj   = a->j;
2577   ambs = a->mbs;
2578   bs2  = a->bs2;
2579 
2580   ierr = VecSet(v,zero);CHKERRQ(ierr);
2581   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2582   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2583   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2584   for (i=0; i<ambs; i++) {
2585     for (j=ai[i]; j<ai[i+1]; j++) {
2586       if (aj[j] == i) {
2587         row  = i*bs;
2588         aa_j = aa+j*bs2;
2589         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2590         break;
2591       }
2592     }
2593   }
2594   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2595   PetscFunctionReturn(0);
2596 }
2597 
2598 #undef __FUNCT__
2599 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2600 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2601 {
2602   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2603   const PetscScalar *l,*r,*li,*ri;
2604   PetscScalar       x;
2605   MatScalar         *aa, *v;
2606   PetscErrorCode    ierr;
2607   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2608   const PetscInt    *ai,*aj;
2609 
2610   PetscFunctionBegin;
2611   ai  = a->i;
2612   aj  = a->j;
2613   aa  = a->a;
2614   m   = A->rmap->n;
2615   n   = A->cmap->n;
2616   bs  = A->rmap->bs;
2617   mbs = a->mbs;
2618   bs2 = a->bs2;
2619   if (ll) {
2620     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2621     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2622     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2623     for (i=0; i<mbs; i++) { /* for each block row */
2624       M  = ai[i+1] - ai[i];
2625       li = l + i*bs;
2626       v  = aa + bs2*ai[i];
2627       for (j=0; j<M; j++) { /* for each block */
2628         for (k=0; k<bs2; k++) {
2629           (*v++) *= li[k%bs];
2630         }
2631       }
2632     }
2633     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2634     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2635   }
2636 
2637   if (rr) {
2638     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2639     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2640     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2641     for (i=0; i<mbs; i++) { /* for each block row */
2642       iai = ai[i];
2643       M   = ai[i+1] - iai;
2644       v   = aa + bs2*iai;
2645       for (j=0; j<M; j++) { /* for each block */
2646         ri = r + bs*aj[iai+j];
2647         for (k=0; k<bs; k++) {
2648           x = ri[k];
2649           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2650           v += bs;
2651         }
2652       }
2653     }
2654     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2655     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2656   }
2657   PetscFunctionReturn(0);
2658 }
2659 
2660 
2661 #undef __FUNCT__
2662 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2663 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2664 {
2665   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2666 
2667   PetscFunctionBegin;
2668   info->block_size   = a->bs2;
2669   info->nz_allocated = a->bs2*a->maxnz;
2670   info->nz_used      = a->bs2*a->nz;
2671   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2672   info->assemblies   = A->num_ass;
2673   info->mallocs      = A->info.mallocs;
2674   info->memory       = ((PetscObject)A)->mem;
2675   if (A->factortype) {
2676     info->fill_ratio_given  = A->info.fill_ratio_given;
2677     info->fill_ratio_needed = A->info.fill_ratio_needed;
2678     info->factor_mallocs    = A->info.factor_mallocs;
2679   } else {
2680     info->fill_ratio_given  = 0;
2681     info->fill_ratio_needed = 0;
2682     info->factor_mallocs    = 0;
2683   }
2684   PetscFunctionReturn(0);
2685 }
2686 
2687 
2688 #if defined(PETSC_THREADCOMM_ACTIVE)
2689 PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A)
2690 {
2691   PetscErrorCode ierr;
2692   PetscInt       *trstarts=A->rmap->trstarts;
2693   PetscInt       n,start,end;
2694   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
2695 
2696   start = trstarts[thread_id]/A->rmap->bs;
2697   end   = trstarts[thread_id+1]/A->rmap->bs;
2698   n     = a->i[end] - a->i[start];
2699   ierr  = PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
2700   return 0;
2701 }
2702 
2703 #undef __FUNCT__
2704 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2705 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2706 {
2707   PetscErrorCode ierr;
2708 
2709 
2710   PetscFunctionBegin;
2711   ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);CHKERRQ(ierr);
2712   PetscFunctionReturn(0);
2713 }
2714 #else
2715 #undef __FUNCT__
2716 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2717 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2718 {
2719   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2720   PetscErrorCode ierr;
2721 
2722   PetscFunctionBegin;
2723   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2724   PetscFunctionReturn(0);
2725 }
2726 #endif
2727