xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision df750dc879e6b61f257e3b41356584ef8bf91d84)
1cac129eeSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <petscblaslapack.h>
6cac129eeSSatish Balay 
7690b6cddSBarry Smith PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
8a3192f15SSatish Balay {
9a3192f15SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
106849ba73SBarry Smith   PetscErrorCode ierr;
115d0c19d7SBarry Smith   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
125d0c19d7SBarry Smith   const PetscInt *idx;
13690b6cddSBarry Smith   PetscInt       start,end,*ai,*aj,bs,*nidx2;
14f1af5d2fSBarry Smith   PetscBT        table;
15a3192f15SSatish Balay 
163a40ed3dSBarry Smith   PetscFunctionBegin;
17a3192f15SSatish Balay   m  = a->mbs;
18a3192f15SSatish Balay   ai = a->i;
19a3192f15SSatish Balay   aj = a->j;
20d0f46423SBarry Smith   bs = A->rmap->bs;
21a3192f15SSatish Balay 
22e32f2f54SBarry Smith   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
23a3192f15SSatish Balay 
2453b8de81SBarry Smith   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
25854ce69bSBarry Smith   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
26854ce69bSBarry Smith   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
27a3192f15SSatish Balay 
28a3192f15SSatish Balay   for (i=0; i<is_max; i++) {
29a3192f15SSatish Balay     /* Initialise the two local arrays */
30a3192f15SSatish Balay     isz  = 0;
316831982aSBarry Smith     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
32a3192f15SSatish Balay 
33a3192f15SSatish Balay     /* Extract the indices, assume there can be duplicate entries */
34a3192f15SSatish Balay     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35b9b97703SBarry Smith     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36a3192f15SSatish Balay 
37a3192f15SSatish Balay     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
38a3192f15SSatish Balay     for (j=0; j<n; ++j) {
39218c64b6SSatish Balay       ival = idx[j]/bs; /* convert the indices into block indices */
40e32f2f54SBarry Smith       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
4126fbe8dcSKarl Rupp       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
42a3192f15SSatish Balay     }
43a3192f15SSatish Balay     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
446bf464f9SBarry Smith     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
45a3192f15SSatish Balay 
46a3192f15SSatish Balay     k = 0;
47a3192f15SSatish Balay     for (j=0; j<ov; j++) { /* for each overlap*/
48a3192f15SSatish Balay       n = isz;
49a3192f15SSatish Balay       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
50a3192f15SSatish Balay         row   = nidx[k];
51a3192f15SSatish Balay         start = ai[row];
52a3192f15SSatish Balay         end   = ai[row+1];
53a3192f15SSatish Balay         for (l = start; l<end; l++) {
54a3192f15SSatish Balay           val = aj[l];
5526fbe8dcSKarl Rupp           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
56a3192f15SSatish Balay         }
57a3192f15SSatish Balay       }
58a3192f15SSatish Balay     }
59218c64b6SSatish Balay     /* expand the Index Set */
60218c64b6SSatish Balay     for (j=0; j<isz; j++) {
6126fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
62218c64b6SSatish Balay     }
6370b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
64a3192f15SSatish Balay   }
6594bacf5dSBarry Smith   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
66606d414cSSatish Balay   ierr = PetscFree(nidx);CHKERRQ(ierr);
67606d414cSSatish Balay   ierr = PetscFree(nidx2);CHKERRQ(ierr);
683a40ed3dSBarry Smith   PetscFunctionReturn(0);
69a3192f15SSatish Balay }
701c351548SSatish Balay 
714aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
72736121d4SSatish Balay {
73736121d4SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
746849ba73SBarry Smith   PetscErrorCode ierr;
75690b6cddSBarry Smith   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
76690b6cddSBarry Smith   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
775d0c19d7SBarry Smith   const PetscInt *irow,*icol;
785d0c19d7SBarry Smith   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
79690b6cddSBarry Smith   PetscInt       *aj = a->j,*ai = a->i;
803f1db9ecSBarry Smith   MatScalar      *mat_a;
81736121d4SSatish Balay   Mat            C;
826041f1b1SToby Isaac   PetscBool      flag;
83736121d4SSatish Balay 
843a40ed3dSBarry Smith   PetscFunctionBegin;
85736121d4SSatish Balay   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
86218c64b6SSatish Balay   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
87b9b97703SBarry Smith   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
88b9b97703SBarry Smith   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
89736121d4SSatish Balay 
90854ce69bSBarry Smith   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
91736121d4SSatish Balay   ssmap = smap;
92854ce69bSBarry Smith   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
93736121d4SSatish Balay   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
94736121d4SSatish Balay   /* determine lens of each row */
95736121d4SSatish Balay   for (i=0; i<nrows; i++) {
96736121d4SSatish Balay     kstart  = ai[irow[i]];
97736121d4SSatish Balay     kend    = kstart + a->ilen[irow[i]];
98736121d4SSatish Balay     lens[i] = 0;
99736121d4SSatish Balay     for (k=kstart; k<kend; k++) {
10026fbe8dcSKarl Rupp       if (ssmap[aj[k]]) lens[i]++;
101736121d4SSatish Balay     }
102736121d4SSatish Balay   }
103736121d4SSatish Balay   /* Create and fill new matrix */
104736121d4SSatish Balay   if (scall == MAT_REUSE_MATRIX) {
105736121d4SSatish Balay     c = (Mat_SeqBAIJ*)((*B)->data);
106736121d4SSatish Balay 
107e32f2f54SBarry Smith     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
108690b6cddSBarry Smith     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
109e7e72b3dSBarry Smith     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
110690b6cddSBarry Smith     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
111736121d4SSatish Balay     C    = *B;
1123a40ed3dSBarry Smith   } else {
113ce94432eSBarry Smith     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
114f69a0ea3SMatthew Knepley     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1157adad957SLisandro Dalcin     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
116367daffbSBarry Smith     ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
117736121d4SSatish Balay   }
118736121d4SSatish Balay   c = (Mat_SeqBAIJ*)(C->data);
119736121d4SSatish Balay   for (i=0; i<nrows; i++) {
120736121d4SSatish Balay     row      = irow[i];
121736121d4SSatish Balay     kstart   = ai[row];
122736121d4SSatish Balay     kend     = kstart + a->ilen[row];
123736121d4SSatish Balay     mat_i    = c->i[i];
124736121d4SSatish Balay     mat_j    = c->j + mat_i;
125218c64b6SSatish Balay     mat_a    = c->a + mat_i*bs2;
126736121d4SSatish Balay     mat_ilen = c->ilen + i;
127736121d4SSatish Balay     for (k=kstart; k<kend; k++) {
128736121d4SSatish Balay       if ((tcol=ssmap[a->j[k]])) {
129736121d4SSatish Balay         *mat_j++ = tcol - 1;
130549d3d68SSatish Balay         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
131549d3d68SSatish Balay         mat_a   += bs2;
132736121d4SSatish Balay         (*mat_ilen)++;
133736121d4SSatish Balay       }
134736121d4SSatish Balay     }
135736121d4SSatish Balay   }
136cdc6f3adSToby Isaac   /* sort */
137cdc6f3adSToby Isaac   {
138cdc6f3adSToby Isaac     MatScalar *work;
139cdc6f3adSToby Isaac     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
140cdc6f3adSToby Isaac     for (i=0; i<nrows; i++) {
141cdc6f3adSToby Isaac       PetscInt ilen;
142cdc6f3adSToby Isaac       mat_i = c->i[i];
143cdc6f3adSToby Isaac       mat_j = c->j + mat_i;
144cdc6f3adSToby Isaac       mat_a = c->a + mat_i*bs2;
145cdc6f3adSToby Isaac       ilen  = c->ilen[i];
146cdc6f3adSToby Isaac       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
147cdc6f3adSToby Isaac     }
148cdc6f3adSToby Isaac     ierr = PetscFree(work);CHKERRQ(ierr);
149cdc6f3adSToby Isaac   }
150218c64b6SSatish Balay 
151736121d4SSatish Balay   /* Free work space */
152736121d4SSatish Balay   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
153606d414cSSatish Balay   ierr = PetscFree(smap);CHKERRQ(ierr);
154606d414cSSatish Balay   ierr = PetscFree(lens);CHKERRQ(ierr);
1556d4a8577SBarry Smith   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1566d4a8577SBarry Smith   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
157736121d4SSatish Balay 
158736121d4SSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
159736121d4SSatish Balay   *B   = C;
1603a40ed3dSBarry Smith   PetscFunctionReturn(0);
161736121d4SSatish Balay }
162736121d4SSatish Balay 
1634aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
164218c64b6SSatish Balay {
165218c64b6SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166218c64b6SSatish Balay   IS             is1,is2;
1676849ba73SBarry Smith   PetscErrorCode ierr;
168afebec48SHong Zhang   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
1695d0c19d7SBarry Smith   const PetscInt *irow,*icol;
170218c64b6SSatish Balay 
1713a40ed3dSBarry Smith   PetscFunctionBegin;
172218c64b6SSatish Balay   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
173218c64b6SSatish Balay   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
174b9b97703SBarry Smith   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
175b9b97703SBarry Smith   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
176218c64b6SSatish Balay 
177218c64b6SSatish Balay   /* Verify if the indices corespond to each element in a block
178218c64b6SSatish Balay    and form the IS with compressed IS */
179f8ecb639SStefano Zampini   maxmnbs = PetscMax(a->mbs,a->nbs);
180f8ecb639SStefano Zampini   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
181fca92195SBarry Smith   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
182218c64b6SSatish Balay   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
183218c64b6SSatish Balay   for (i=0; i<a->mbs; i++) {
184e32f2f54SBarry Smith     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
1856041f1b1SToby Isaac   }
1866041f1b1SToby Isaac   count = 0;
1876041f1b1SToby Isaac   for (i=0; i<nrows; i++) {
188afebec48SHong Zhang     j = irow[i] / bs;
1896041f1b1SToby Isaac     if ((vary[j]--)==bs) iary[count++] = j;
190218c64b6SSatish Balay   }
19170b3c8c7SBarry Smith   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
192218c64b6SSatish Balay 
193f8ecb639SStefano Zampini   ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr);
194218c64b6SSatish Balay   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
195f8ecb639SStefano Zampini   for (i=0; i<a->nbs; i++) {
196e32f2f54SBarry Smith     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
197218c64b6SSatish Balay   }
1986041f1b1SToby Isaac   count = 0;
1996041f1b1SToby Isaac   for (i=0; i<ncols; i++) {
200afebec48SHong Zhang     j = icol[i] / bs;
2016041f1b1SToby Isaac     if ((vary[j]--)==bs) iary[count++] = j;
2026041f1b1SToby Isaac   }
20370b3c8c7SBarry Smith   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
204218c64b6SSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205218c64b6SSatish Balay   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
206fca92195SBarry Smith   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
207218c64b6SSatish Balay 
2084aa3045dSJed Brown   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
2096bf464f9SBarry Smith   ierr = ISDestroy(&is1);CHKERRQ(ierr);
2106bf464f9SBarry Smith   ierr = ISDestroy(&is2);CHKERRQ(ierr);
2113a40ed3dSBarry Smith   PetscFunctionReturn(0);
212218c64b6SSatish Balay }
213218c64b6SSatish Balay 
214690b6cddSBarry Smith PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
215736121d4SSatish Balay {
2166849ba73SBarry Smith   PetscErrorCode ierr;
217690b6cddSBarry Smith   PetscInt       i;
218736121d4SSatish Balay 
2193a40ed3dSBarry Smith   PetscFunctionBegin;
220736121d4SSatish Balay   if (scall == MAT_INITIAL_MATRIX) {
221*df750dc8SHong Zhang     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
222736121d4SSatish Balay   }
223736121d4SSatish Balay 
224736121d4SSatish Balay   for (i=0; i<n; i++) {
2254aa3045dSJed Brown     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
226736121d4SSatish Balay   }
2273a40ed3dSBarry Smith   PetscFunctionReturn(0);
228736121d4SSatish Balay }
229218c64b6SSatish Balay 
230218c64b6SSatish Balay 
2312d61bbb3SSatish Balay /* -------------------------------------------------------*/
2322d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */
2332d61bbb3SSatish Balay /* -------------------------------------------------------*/
2342d61bbb3SSatish Balay 
235dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
2362d61bbb3SSatish Balay {
2372d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
238d9fead3dSBarry Smith   PetscScalar       *z,sum;
239d9fead3dSBarry Smith   const PetscScalar *x;
240d9fead3dSBarry Smith   const MatScalar   *v;
2416849ba73SBarry Smith   PetscErrorCode    ierr;
2427c565772SBarry Smith   PetscInt          mbs,i,n;
2430298fd71SBarry Smith   const PetscInt    *idx,*ii,*ridx=NULL;
244ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
2452d61bbb3SSatish Balay 
2462d61bbb3SSatish Balay   PetscFunctionBegin;
2473649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2481ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2492d61bbb3SSatish Balay 
25026e093fcSHong Zhang   if (usecprow) {
25126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
25226e093fcSHong Zhang     ii   = a->compressedrow.i;
2537b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
254ef16e671SStefano Zampini     ierr = PetscMemzero(z,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
25526e093fcSHong Zhang   } else {
25626e093fcSHong Zhang     mbs = a->mbs;
2572d61bbb3SSatish Balay     ii  = a->i;
25826e093fcSHong Zhang   }
2592d61bbb3SSatish Balay 
2602d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
261ee54c7eeSHong Zhang     n   = ii[1] - ii[0];
262ee54c7eeSHong Zhang     v   = a->a + ii[0];
263ee54c7eeSHong Zhang     idx = a->j + ii[0];
264ee54c7eeSHong Zhang     ii++;
265444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
266444d8c10SJed Brown     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2672d61bbb3SSatish Balay     sum = 0.0;
2682162cab8SBarry Smith     PetscSparseDensePlusDot(sum,x,v,idx,n);
26926e093fcSHong Zhang     if (usecprow) {
2707b2bb3b9SHong Zhang       z[ridx[i]] = sum;
27126e093fcSHong Zhang     } else {
2722d61bbb3SSatish Balay       z[i]       = sum;
2732d61bbb3SSatish Balay     }
27426e093fcSHong Zhang   }
2753649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2761ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2777c565772SBarry Smith   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
2782d61bbb3SSatish Balay   PetscFunctionReturn(0);
2792d61bbb3SSatish Balay }
2802d61bbb3SSatish Balay 
281dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
2822d61bbb3SSatish Balay {
2832d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
284d9fead3dSBarry Smith   PetscScalar       *z = 0,sum1,sum2,*zarray;
285d9fead3dSBarry Smith   const PetscScalar *x,*xb;
28687828ca2SBarry Smith   PetscScalar       x1,x2;
287d9fead3dSBarry Smith   const MatScalar   *v;
288dfbe8321SBarry Smith   PetscErrorCode    ierr;
2897c565772SBarry Smith   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
290ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
2912d61bbb3SSatish Balay 
2922d61bbb3SSatish Balay   PetscFunctionBegin;
2933649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
29426e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2952d61bbb3SSatish Balay 
2962d61bbb3SSatish Balay   idx = a->j;
2972d61bbb3SSatish Balay   v   = a->a;
29826e093fcSHong Zhang   if (usecprow) {
29926e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
30026e093fcSHong Zhang     ii   = a->compressedrow.i;
3017b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
302ef16e671SStefano Zampini     ierr = PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
30326e093fcSHong Zhang   } else {
30426e093fcSHong Zhang     mbs = a->mbs;
3052d61bbb3SSatish Balay     ii  = a->i;
30626e093fcSHong Zhang     z   = zarray;
30726e093fcSHong Zhang   }
3082d61bbb3SSatish Balay 
3092d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
3102d61bbb3SSatish Balay     n           = ii[1] - ii[0]; ii++;
3112d61bbb3SSatish Balay     sum1        = 0.0; sum2 = 0.0;
312444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
313444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3142d61bbb3SSatish Balay     for (j=0; j<n; j++) {
3152d61bbb3SSatish Balay       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
3162d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[2]*x2;
3172d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[3]*x2;
3182d61bbb3SSatish Balay       v    += 4;
3192d61bbb3SSatish Balay     }
3207b2bb3b9SHong Zhang     if (usecprow) z = zarray + 2*ridx[i];
3212d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2;
32226e093fcSHong Zhang     if (!usecprow) z += 2;
3232d61bbb3SSatish Balay   }
3243649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
32526e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
3267c565772SBarry Smith   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
3272d61bbb3SSatish Balay   PetscFunctionReturn(0);
3282d61bbb3SSatish Balay }
3292d61bbb3SSatish Balay 
330dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
3312d61bbb3SSatish Balay {
3322d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
333d9fead3dSBarry Smith   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
334d9fead3dSBarry Smith   const PetscScalar *x,*xb;
335d9fead3dSBarry Smith   const MatScalar   *v;
336dfbe8321SBarry Smith   PetscErrorCode    ierr;
3377c565772SBarry Smith   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
338ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
33926e093fcSHong Zhang 
340b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
341fee21e36SBarry Smith #pragma disjoint(*v,*z,*xb)
342fee21e36SBarry Smith #endif
343fee21e36SBarry Smith 
3442d61bbb3SSatish Balay   PetscFunctionBegin;
3453649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
34626e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
3472d61bbb3SSatish Balay 
3482d61bbb3SSatish Balay   idx = a->j;
3492d61bbb3SSatish Balay   v   = a->a;
35026e093fcSHong Zhang   if (usecprow) {
35126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
35226e093fcSHong Zhang     ii   = a->compressedrow.i;
3537b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
354ef16e671SStefano Zampini     ierr = PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
35526e093fcSHong Zhang   } else {
35626e093fcSHong Zhang     mbs = a->mbs;
3572d61bbb3SSatish Balay     ii  = a->i;
35826e093fcSHong Zhang     z   = zarray;
35926e093fcSHong Zhang   }
3602d61bbb3SSatish Balay 
3612d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
3622d61bbb3SSatish Balay     n           = ii[1] - ii[0]; ii++;
3632d61bbb3SSatish Balay     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
364444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
365444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3662d61bbb3SSatish Balay     for (j=0; j<n; j++) {
36726fbe8dcSKarl Rupp       xb = x + 3*(*idx++);
36826fbe8dcSKarl Rupp       x1 = xb[0];
36926fbe8dcSKarl Rupp       x2 = xb[1];
37026fbe8dcSKarl Rupp       x3 = xb[2];
37126fbe8dcSKarl Rupp 
3722d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
3732d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
3742d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
3752d61bbb3SSatish Balay       v    += 9;
3762d61bbb3SSatish Balay     }
3777b2bb3b9SHong Zhang     if (usecprow) z = zarray + 3*ridx[i];
3782d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3;
37926e093fcSHong Zhang     if (!usecprow) z += 3;
3802d61bbb3SSatish Balay   }
3813649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
38226e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
3837c565772SBarry Smith   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
3842d61bbb3SSatish Balay   PetscFunctionReturn(0);
3852d61bbb3SSatish Balay }
3862d61bbb3SSatish Balay 
387dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
3882d61bbb3SSatish Balay {
3892d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
390d9fead3dSBarry Smith   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
391d9fead3dSBarry Smith   const PetscScalar *x,*xb;
392d9fead3dSBarry Smith   const MatScalar   *v;
393dfbe8321SBarry Smith   PetscErrorCode    ierr;
3947c565772SBarry Smith   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
395ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
3962d61bbb3SSatish Balay 
3972d61bbb3SSatish Balay   PetscFunctionBegin;
3983649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
39926e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
4002d61bbb3SSatish Balay 
4012d61bbb3SSatish Balay   idx = a->j;
4022d61bbb3SSatish Balay   v   = a->a;
40326e093fcSHong Zhang   if (usecprow) {
40426e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
40526e093fcSHong Zhang     ii   = a->compressedrow.i;
4067b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
407ef16e671SStefano Zampini     ierr = PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
40826e093fcSHong Zhang   } else {
40926e093fcSHong Zhang     mbs = a->mbs;
4102d61bbb3SSatish Balay     ii  = a->i;
41126e093fcSHong Zhang     z   = zarray;
41226e093fcSHong Zhang   }
4132d61bbb3SSatish Balay 
4142d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
41526fbe8dcSKarl Rupp     n = ii[1] - ii[0];
41626fbe8dcSKarl Rupp     ii++;
41726fbe8dcSKarl Rupp     sum1 = 0.0;
41826fbe8dcSKarl Rupp     sum2 = 0.0;
41926fbe8dcSKarl Rupp     sum3 = 0.0;
42026fbe8dcSKarl Rupp     sum4 = 0.0;
42126fbe8dcSKarl Rupp 
422444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
423444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
4242d61bbb3SSatish Balay     for (j=0; j<n; j++) {
4252d61bbb3SSatish Balay       xb    = x + 4*(*idx++);
4262d61bbb3SSatish Balay       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
4272d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4282d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4292d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4302d61bbb3SSatish Balay       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4312d61bbb3SSatish Balay       v    += 16;
4322d61bbb3SSatish Balay     }
4337b2bb3b9SHong Zhang     if (usecprow) z = zarray + 4*ridx[i];
4342d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
43526e093fcSHong Zhang     if (!usecprow) z += 4;
4362d61bbb3SSatish Balay   }
4373649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
43826e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
4397c565772SBarry Smith   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
4402d61bbb3SSatish Balay   PetscFunctionReturn(0);
4412d61bbb3SSatish Balay }
4422d61bbb3SSatish Balay 
443dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
4442d61bbb3SSatish Balay {
4452d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
446d9fead3dSBarry Smith   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
447d9fead3dSBarry Smith   const PetscScalar *xb,*x;
448d9fead3dSBarry Smith   const MatScalar   *v;
449dfbe8321SBarry Smith   PetscErrorCode    ierr;
4500298fd71SBarry Smith   const PetscInt    *idx,*ii,*ridx=NULL;
4517c565772SBarry Smith   PetscInt          mbs,i,j,n;
452ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
4532d61bbb3SSatish Balay 
454433994e6SBarry Smith   PetscFunctionBegin;
4553649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
45626e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
4572d61bbb3SSatish Balay 
4582d61bbb3SSatish Balay   idx = a->j;
4592d61bbb3SSatish Balay   v   = a->a;
46026e093fcSHong Zhang   if (usecprow) {
46126e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
46226e093fcSHong Zhang     ii   = a->compressedrow.i;
4637b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
464ef16e671SStefano Zampini     ierr = PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
46526e093fcSHong Zhang   } else {
46626e093fcSHong Zhang     mbs = a->mbs;
4672d61bbb3SSatish Balay     ii  = a->i;
46826e093fcSHong Zhang     z   = zarray;
46926e093fcSHong Zhang   }
4702d61bbb3SSatish Balay 
4712d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
4722d61bbb3SSatish Balay     n           = ii[1] - ii[0]; ii++;
4732d61bbb3SSatish Balay     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
474444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
475444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
4762d61bbb3SSatish Balay     for (j=0; j<n; j++) {
4772d61bbb3SSatish Balay       xb    = x + 5*(*idx++);
4782d61bbb3SSatish Balay       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
4792d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
4802d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
4812d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
4822d61bbb3SSatish Balay       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
4832d61bbb3SSatish Balay       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
4842d61bbb3SSatish Balay       v    += 25;
4852d61bbb3SSatish Balay     }
4867b2bb3b9SHong Zhang     if (usecprow) z = zarray + 5*ridx[i];
4872d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
48826e093fcSHong Zhang     if (!usecprow) z += 5;
4892d61bbb3SSatish Balay   }
4903649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
49126e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
4927c565772SBarry Smith   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
4932d61bbb3SSatish Balay   PetscFunctionReturn(0);
4942d61bbb3SSatish Balay }
4952d61bbb3SSatish Balay 
49615091d37SBarry Smith 
497d6232bceSMichael Lange 
498dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
49915091d37SBarry Smith {
50015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
501d9fead3dSBarry Smith   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
502d9fead3dSBarry Smith   const PetscScalar *x,*xb;
50326e093fcSHong Zhang   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
504d9fead3dSBarry Smith   const MatScalar   *v;
505dfbe8321SBarry Smith   PetscErrorCode    ierr;
5067c565772SBarry Smith   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
507ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
50815091d37SBarry Smith 
509433994e6SBarry Smith   PetscFunctionBegin;
5103649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
51126e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
51215091d37SBarry Smith 
51315091d37SBarry Smith   idx = a->j;
51415091d37SBarry Smith   v   = a->a;
51526e093fcSHong Zhang   if (usecprow) {
51626e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
51726e093fcSHong Zhang     ii   = a->compressedrow.i;
5187b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
519ef16e671SStefano Zampini     ierr = PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
52026e093fcSHong Zhang   } else {
52126e093fcSHong Zhang     mbs = a->mbs;
52215091d37SBarry Smith     ii  = a->i;
52326e093fcSHong Zhang     z   = zarray;
52426e093fcSHong Zhang   }
52515091d37SBarry Smith 
52615091d37SBarry Smith   for (i=0; i<mbs; i++) {
52726fbe8dcSKarl Rupp     n  = ii[1] - ii[0];
52826fbe8dcSKarl Rupp     ii++;
52926fbe8dcSKarl Rupp     sum1 = 0.0;
53026fbe8dcSKarl Rupp     sum2 = 0.0;
53126fbe8dcSKarl Rupp     sum3 = 0.0;
53226fbe8dcSKarl Rupp     sum4 = 0.0;
53326fbe8dcSKarl Rupp     sum5 = 0.0;
53426fbe8dcSKarl Rupp     sum6 = 0.0;
53526fbe8dcSKarl Rupp 
536444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
537444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
53815091d37SBarry Smith     for (j=0; j<n; j++) {
53915091d37SBarry Smith       xb    = x + 6*(*idx++);
54015091d37SBarry Smith       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
54115091d37SBarry Smith       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
54215091d37SBarry Smith       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
54315091d37SBarry Smith       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
54415091d37SBarry Smith       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
54515091d37SBarry Smith       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
54615091d37SBarry Smith       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
54715091d37SBarry Smith       v    += 36;
54815091d37SBarry Smith     }
5497b2bb3b9SHong Zhang     if (usecprow) z = zarray + 6*ridx[i];
55015091d37SBarry Smith     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
55126e093fcSHong Zhang     if (!usecprow) z += 6;
55215091d37SBarry Smith   }
55315091d37SBarry Smith 
5543649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
55526e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
5567c565772SBarry Smith   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
55715091d37SBarry Smith   PetscFunctionReturn(0);
55815091d37SBarry Smith }
5598ab949d8SShri Abhyankar 
560dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
5612d61bbb3SSatish Balay {
5622d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
563d9fead3dSBarry Smith   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
564d9fead3dSBarry Smith   const PetscScalar *x,*xb;
56526e093fcSHong Zhang   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
566d9fead3dSBarry Smith   const MatScalar   *v;
567dfbe8321SBarry Smith   PetscErrorCode    ierr;
5687c565772SBarry Smith   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
569ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
5702d61bbb3SSatish Balay 
571433994e6SBarry Smith   PetscFunctionBegin;
5723649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
57326e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
5742d61bbb3SSatish Balay 
5752d61bbb3SSatish Balay   idx = a->j;
5762d61bbb3SSatish Balay   v   = a->a;
57726e093fcSHong Zhang   if (usecprow) {
57826e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
57926e093fcSHong Zhang     ii   = a->compressedrow.i;
5807b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
581ef16e671SStefano Zampini     ierr = PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
58226e093fcSHong Zhang   } else {
58326e093fcSHong Zhang     mbs = a->mbs;
5842d61bbb3SSatish Balay     ii  = a->i;
58526e093fcSHong Zhang     z   = zarray;
58626e093fcSHong Zhang   }
5872d61bbb3SSatish Balay 
5882d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
58926fbe8dcSKarl Rupp     n  = ii[1] - ii[0];
59026fbe8dcSKarl Rupp     ii++;
59126fbe8dcSKarl Rupp     sum1 = 0.0;
59226fbe8dcSKarl Rupp     sum2 = 0.0;
59326fbe8dcSKarl Rupp     sum3 = 0.0;
59426fbe8dcSKarl Rupp     sum4 = 0.0;
59526fbe8dcSKarl Rupp     sum5 = 0.0;
59626fbe8dcSKarl Rupp     sum6 = 0.0;
59726fbe8dcSKarl Rupp     sum7 = 0.0;
59826fbe8dcSKarl Rupp 
599444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
600444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
6012d61bbb3SSatish Balay     for (j=0; j<n; j++) {
6022d61bbb3SSatish Balay       xb    = x + 7*(*idx++);
6032d61bbb3SSatish Balay       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
6042d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
6052d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
6062d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
6072d61bbb3SSatish Balay       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
6082d61bbb3SSatish Balay       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
6092d61bbb3SSatish Balay       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
6102d61bbb3SSatish Balay       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
6112d61bbb3SSatish Balay       v    += 49;
6122d61bbb3SSatish Balay     }
6137b2bb3b9SHong Zhang     if (usecprow) z = zarray + 7*ridx[i];
6142d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
61526e093fcSHong Zhang     if (!usecprow) z += 7;
6162d61bbb3SSatish Balay   }
6172d61bbb3SSatish Balay 
6183649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
61926e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
6207c565772SBarry Smith   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
6212d61bbb3SSatish Balay   PetscFunctionReturn(0);
6222d61bbb3SSatish Balay }
6232d61bbb3SSatish Balay 
6248ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
625832cc040SShri Abhyankar /* Default MatMult for block size 15 */
6268ab949d8SShri Abhyankar 
6278ab949d8SShri Abhyankar PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
6288ab949d8SShri Abhyankar {
6298ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
6308ab949d8SShri Abhyankar   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
6318ab949d8SShri Abhyankar   const PetscScalar *x,*xb;
63253ef36baSBarry Smith   PetscScalar       *zarray,xv;
6338ab949d8SShri Abhyankar   const MatScalar   *v;
6348ab949d8SShri Abhyankar   PetscErrorCode    ierr;
6358ab949d8SShri Abhyankar   const PetscInt    *ii,*ij=a->j,*idx;
6367c565772SBarry Smith   PetscInt          mbs,i,j,k,n,*ridx=NULL;
637ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
6388ab949d8SShri Abhyankar 
6398ab949d8SShri Abhyankar   PetscFunctionBegin;
6403649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
6418ab949d8SShri Abhyankar   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
6428ab949d8SShri Abhyankar 
6438ab949d8SShri Abhyankar   v = a->a;
6448ab949d8SShri Abhyankar   if (usecprow) {
6458ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
6468ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
6478ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
648ef16e671SStefano Zampini     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
6498ab949d8SShri Abhyankar   } else {
6508ab949d8SShri Abhyankar     mbs = a->mbs;
6518ab949d8SShri Abhyankar     ii  = a->i;
6528ab949d8SShri Abhyankar     z   = zarray;
6538ab949d8SShri Abhyankar   }
6548ab949d8SShri Abhyankar 
6558ab949d8SShri Abhyankar   for (i=0; i<mbs; i++) {
6568ab949d8SShri Abhyankar     n    = ii[i+1] - ii[i];
6578ab949d8SShri Abhyankar     idx  = ij + ii[i];
6588ab949d8SShri Abhyankar     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
6598ab949d8SShri Abhyankar     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
6608ab949d8SShri Abhyankar 
6618ab949d8SShri Abhyankar     for (j=0; j<n; j++) {
6628ab949d8SShri Abhyankar       xb = x + 15*(idx[j]);
6638ab949d8SShri Abhyankar 
6648ab949d8SShri Abhyankar       for (k=0; k<15; k++) {
66553ef36baSBarry Smith         xv     =  xb[k];
66653ef36baSBarry Smith         sum1  += v[0]*xv;
66753ef36baSBarry Smith         sum2  += v[1]*xv;
66853ef36baSBarry Smith         sum3  += v[2]*xv;
66953ef36baSBarry Smith         sum4  += v[3]*xv;
67053ef36baSBarry Smith         sum5  += v[4]*xv;
67153ef36baSBarry Smith         sum6  += v[5]*xv;
67253ef36baSBarry Smith         sum7  += v[6]*xv;
67353ef36baSBarry Smith         sum8  += v[7]*xv;
67453ef36baSBarry Smith         sum9  += v[8]*xv;
67553ef36baSBarry Smith         sum10 += v[9]*xv;
67653ef36baSBarry Smith         sum11 += v[10]*xv;
67753ef36baSBarry Smith         sum12 += v[11]*xv;
67853ef36baSBarry Smith         sum13 += v[12]*xv;
67953ef36baSBarry Smith         sum14 += v[13]*xv;
68053ef36baSBarry Smith         sum15 += v[14]*xv;
6818ab949d8SShri Abhyankar         v     += 15;
6828ab949d8SShri Abhyankar       }
6838ab949d8SShri Abhyankar     }
6848ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15*ridx[i];
6858ab949d8SShri Abhyankar     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
6868ab949d8SShri Abhyankar     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
6878ab949d8SShri Abhyankar 
6888ab949d8SShri Abhyankar     if (!usecprow) z += 15;
6898ab949d8SShri Abhyankar   }
6908ab949d8SShri Abhyankar 
6913649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6928ab949d8SShri Abhyankar   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
6937c565772SBarry Smith   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
6948ab949d8SShri Abhyankar   PetscFunctionReturn(0);
6958ab949d8SShri Abhyankar }
6968ab949d8SShri Abhyankar 
6978ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
6988ab949d8SShri Abhyankar PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
6998ab949d8SShri Abhyankar {
7008ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
7018ab949d8SShri Abhyankar   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
7028ab949d8SShri Abhyankar   const PetscScalar *x,*xb;
7030b8f6341SShri Abhyankar   PetscScalar       x1,x2,x3,x4,*zarray;
7048ab949d8SShri Abhyankar   const MatScalar   *v;
7058ab949d8SShri Abhyankar   PetscErrorCode    ierr;
7068ab949d8SShri Abhyankar   const PetscInt    *ii,*ij=a->j,*idx;
7077c565772SBarry Smith   PetscInt          mbs,i,j,n,*ridx=NULL;
708ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
7098ab949d8SShri Abhyankar 
7108ab949d8SShri Abhyankar   PetscFunctionBegin;
7113649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7128ab949d8SShri Abhyankar   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
7138ab949d8SShri Abhyankar 
7148ab949d8SShri Abhyankar   v = a->a;
7158ab949d8SShri Abhyankar   if (usecprow) {
7168ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
7178ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
7188ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
719ef16e671SStefano Zampini     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
7208ab949d8SShri Abhyankar   } else {
7218ab949d8SShri Abhyankar     mbs = a->mbs;
7228ab949d8SShri Abhyankar     ii  = a->i;
7238ab949d8SShri Abhyankar     z   = zarray;
7248ab949d8SShri Abhyankar   }
7258ab949d8SShri Abhyankar 
7268ab949d8SShri Abhyankar   for (i=0; i<mbs; i++) {
7278ab949d8SShri Abhyankar     n    = ii[i+1] - ii[i];
7288ab949d8SShri Abhyankar     idx  = ij + ii[i];
7298ab949d8SShri Abhyankar     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
7308ab949d8SShri Abhyankar     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
7318ab949d8SShri Abhyankar 
7328ab949d8SShri Abhyankar     for (j=0; j<n; j++) {
7338ab949d8SShri Abhyankar       xb = x + 15*(idx[j]);
7340b8f6341SShri Abhyankar       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
7358ab949d8SShri Abhyankar 
7368ab949d8SShri Abhyankar       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
7378ab949d8SShri Abhyankar       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
7388ab949d8SShri Abhyankar       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
7398ab949d8SShri Abhyankar       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
7408ab949d8SShri Abhyankar       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
7418ab949d8SShri Abhyankar       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
7428ab949d8SShri Abhyankar       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
7438ab949d8SShri Abhyankar       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
7448ab949d8SShri Abhyankar       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
7458ab949d8SShri Abhyankar       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
7468ab949d8SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
7478ab949d8SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
7488ab949d8SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
7498ab949d8SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
7508ab949d8SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
7518ab949d8SShri Abhyankar 
7528ab949d8SShri Abhyankar       v += 60;
7538ab949d8SShri Abhyankar 
7540b8f6341SShri Abhyankar       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
7558ab949d8SShri Abhyankar 
7568ab949d8SShri Abhyankar       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
7578ab949d8SShri Abhyankar       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
7588ab949d8SShri Abhyankar       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
7598ab949d8SShri Abhyankar       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
7608ab949d8SShri Abhyankar       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
7618ab949d8SShri Abhyankar       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
7628ab949d8SShri Abhyankar       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
7638ab949d8SShri Abhyankar       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
7648ab949d8SShri Abhyankar       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
7658ab949d8SShri Abhyankar       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
7668ab949d8SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
7678ab949d8SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
7688ab949d8SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
7698ab949d8SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
7708ab949d8SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
7718ab949d8SShri Abhyankar       v     += 60;
7728ab949d8SShri Abhyankar 
7730b8f6341SShri Abhyankar       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
7740b8f6341SShri Abhyankar       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
7750b8f6341SShri Abhyankar       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
7760b8f6341SShri Abhyankar       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
7770b8f6341SShri Abhyankar       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
7780b8f6341SShri Abhyankar       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
7790b8f6341SShri Abhyankar       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
7800b8f6341SShri Abhyankar       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
7810b8f6341SShri Abhyankar       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
7820b8f6341SShri Abhyankar       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
7830b8f6341SShri Abhyankar       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
7840b8f6341SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
7850b8f6341SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
7860b8f6341SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
7870b8f6341SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
7880b8f6341SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
7890b8f6341SShri Abhyankar       v     += 60;
7900b8f6341SShri Abhyankar 
7910b8f6341SShri Abhyankar       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
7928ab949d8SShri Abhyankar       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
7938ab949d8SShri Abhyankar       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
7948ab949d8SShri Abhyankar       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
7958ab949d8SShri Abhyankar       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
7968ab949d8SShri Abhyankar       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
7978ab949d8SShri Abhyankar       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
7988ab949d8SShri Abhyankar       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
7998ab949d8SShri Abhyankar       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
8008ab949d8SShri Abhyankar       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
8018ab949d8SShri Abhyankar       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
8028ab949d8SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
8038ab949d8SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
8048ab949d8SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
8058ab949d8SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
8068ab949d8SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
8078ab949d8SShri Abhyankar       v     += 45;
8088ab949d8SShri Abhyankar     }
8098ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15*ridx[i];
8108ab949d8SShri Abhyankar     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
8118ab949d8SShri Abhyankar     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
8128ab949d8SShri Abhyankar 
8138ab949d8SShri Abhyankar     if (!usecprow) z += 15;
8148ab949d8SShri Abhyankar   }
8158ab949d8SShri Abhyankar 
8163649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
8178ab949d8SShri Abhyankar   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
8187c565772SBarry Smith   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
8198ab949d8SShri Abhyankar   PetscFunctionReturn(0);
8208ab949d8SShri Abhyankar }
8218ab949d8SShri Abhyankar 
8228ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
8238ab949d8SShri Abhyankar PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
8248ab949d8SShri Abhyankar {
8258ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
8268ab949d8SShri Abhyankar   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
8278ab949d8SShri Abhyankar   const PetscScalar *x,*xb;
8280b8f6341SShri Abhyankar   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
8298ab949d8SShri Abhyankar   const MatScalar   *v;
8308ab949d8SShri Abhyankar   PetscErrorCode    ierr;
8318ab949d8SShri Abhyankar   const PetscInt    *ii,*ij=a->j,*idx;
8327c565772SBarry Smith   PetscInt          mbs,i,j,n,*ridx=NULL;
833ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
8348ab949d8SShri Abhyankar 
8358ab949d8SShri Abhyankar   PetscFunctionBegin;
8363649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8378ab949d8SShri Abhyankar   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
8388ab949d8SShri Abhyankar 
8398ab949d8SShri Abhyankar   v = a->a;
8408ab949d8SShri Abhyankar   if (usecprow) {
8418ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
8428ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
8438ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
844ef16e671SStefano Zampini     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
8458ab949d8SShri Abhyankar   } else {
8468ab949d8SShri Abhyankar     mbs = a->mbs;
8478ab949d8SShri Abhyankar     ii  = a->i;
8488ab949d8SShri Abhyankar     z   = zarray;
8498ab949d8SShri Abhyankar   }
8508ab949d8SShri Abhyankar 
8518ab949d8SShri Abhyankar   for (i=0; i<mbs; i++) {
8528ab949d8SShri Abhyankar     n    = ii[i+1] - ii[i];
8538ab949d8SShri Abhyankar     idx  = ij + ii[i];
8548ab949d8SShri Abhyankar     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
8558ab949d8SShri Abhyankar     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
8568ab949d8SShri Abhyankar 
8578ab949d8SShri Abhyankar     for (j=0; j<n; j++) {
8588ab949d8SShri Abhyankar       xb = x + 15*(idx[j]);
8598ab949d8SShri Abhyankar       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
8600b8f6341SShri Abhyankar       x8 = xb[7];
8618ab949d8SShri Abhyankar 
8628ab949d8SShri Abhyankar       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
8638ab949d8SShri Abhyankar       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
8648ab949d8SShri Abhyankar       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
8658ab949d8SShri Abhyankar       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
8668ab949d8SShri Abhyankar       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
8678ab949d8SShri Abhyankar       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
8688ab949d8SShri Abhyankar       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
8698ab949d8SShri Abhyankar       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
8708ab949d8SShri Abhyankar       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
8718ab949d8SShri Abhyankar       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
8728ab949d8SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
8738ab949d8SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
8748ab949d8SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
8758ab949d8SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
8768ab949d8SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
8778ab949d8SShri Abhyankar       v     += 120;
8788ab949d8SShri Abhyankar 
8790b8f6341SShri Abhyankar       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
8800b8f6341SShri Abhyankar 
8818ab949d8SShri Abhyankar       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
8828ab949d8SShri Abhyankar       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
8838ab949d8SShri Abhyankar       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
8848ab949d8SShri Abhyankar       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
8858ab949d8SShri Abhyankar       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
8868ab949d8SShri Abhyankar       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
8878ab949d8SShri Abhyankar       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
8888ab949d8SShri Abhyankar       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
8898ab949d8SShri Abhyankar       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
8908ab949d8SShri Abhyankar       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
8918ab949d8SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
8928ab949d8SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
8938ab949d8SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
8948ab949d8SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
8958ab949d8SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
8968ab949d8SShri Abhyankar       v     += 105;
8978ab949d8SShri Abhyankar     }
8988ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15*ridx[i];
8998ab949d8SShri Abhyankar     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
9008ab949d8SShri Abhyankar     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
9018ab949d8SShri Abhyankar 
9028ab949d8SShri Abhyankar     if (!usecprow) z += 15;
9038ab949d8SShri Abhyankar   }
9048ab949d8SShri Abhyankar 
9053649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
9068ab949d8SShri Abhyankar   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
9077c565772SBarry Smith   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
9088ab949d8SShri Abhyankar   PetscFunctionReturn(0);
9098ab949d8SShri Abhyankar }
9108ab949d8SShri Abhyankar 
9118ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
9128ab949d8SShri Abhyankar 
9138ab949d8SShri Abhyankar PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
9148ab949d8SShri Abhyankar {
9158ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
9168ab949d8SShri Abhyankar   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
9178ab949d8SShri Abhyankar   const PetscScalar *x,*xb;
9188ab949d8SShri Abhyankar   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
9198ab949d8SShri Abhyankar   const MatScalar   *v;
9208ab949d8SShri Abhyankar   PetscErrorCode    ierr;
9218ab949d8SShri Abhyankar   const PetscInt    *ii,*ij=a->j,*idx;
9227c565772SBarry Smith   PetscInt          mbs,i,j,n,*ridx=NULL;
923ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
9248ab949d8SShri Abhyankar 
9258ab949d8SShri Abhyankar   PetscFunctionBegin;
9263649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9278ab949d8SShri Abhyankar   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
9288ab949d8SShri Abhyankar 
9298ab949d8SShri Abhyankar   v = a->a;
9308ab949d8SShri Abhyankar   if (usecprow) {
9318ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
9328ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
9338ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
934ef16e671SStefano Zampini     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
9358ab949d8SShri Abhyankar   } else {
9368ab949d8SShri Abhyankar     mbs = a->mbs;
9378ab949d8SShri Abhyankar     ii  = a->i;
9388ab949d8SShri Abhyankar     z   = zarray;
9398ab949d8SShri Abhyankar   }
9408ab949d8SShri Abhyankar 
9418ab949d8SShri Abhyankar   for (i=0; i<mbs; i++) {
9428ab949d8SShri Abhyankar     n    = ii[i+1] - ii[i];
9438ab949d8SShri Abhyankar     idx  = ij + ii[i];
9448ab949d8SShri Abhyankar     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
9458ab949d8SShri Abhyankar     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
9468ab949d8SShri Abhyankar 
9478ab949d8SShri Abhyankar     for (j=0; j<n; j++) {
9488ab949d8SShri Abhyankar       xb = x + 15*(idx[j]);
9498ab949d8SShri Abhyankar       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
9508ab949d8SShri Abhyankar       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
9518ab949d8SShri Abhyankar 
9528ab949d8SShri Abhyankar       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
9538ab949d8SShri Abhyankar       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
9548ab949d8SShri Abhyankar       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
9558ab949d8SShri Abhyankar       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
9568ab949d8SShri Abhyankar       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
9578ab949d8SShri Abhyankar       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
9588ab949d8SShri Abhyankar       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
9598ab949d8SShri Abhyankar       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
9608ab949d8SShri Abhyankar       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
9618ab949d8SShri Abhyankar       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
9628ab949d8SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
9638ab949d8SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
9648ab949d8SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
9658ab949d8SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
9668ab949d8SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
9678ab949d8SShri Abhyankar       v     += 225;
9688ab949d8SShri Abhyankar     }
9698ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15*ridx[i];
9708ab949d8SShri Abhyankar     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
9718ab949d8SShri Abhyankar     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
9728ab949d8SShri Abhyankar 
9738ab949d8SShri Abhyankar     if (!usecprow) z += 15;
9748ab949d8SShri Abhyankar   }
9758ab949d8SShri Abhyankar 
9763649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
9778ab949d8SShri Abhyankar   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
9787c565772SBarry Smith   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
9798ab949d8SShri Abhyankar   PetscFunctionReturn(0);
9808ab949d8SShri Abhyankar }
9818ab949d8SShri Abhyankar 
9828ab949d8SShri Abhyankar 
9833f1db9ecSBarry Smith /*
9843f1db9ecSBarry Smith     This will not work with MatScalar == float because it calls the BLAS
9853f1db9ecSBarry Smith */
986dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
9872d61bbb3SSatish Balay {
9882d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
989d9ca1df4SBarry Smith   PetscScalar       *z = 0,*work,*workt,*zarray;
990d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
991d9ca1df4SBarry Smith   const MatScalar   *v;
992dfbe8321SBarry Smith   PetscErrorCode    ierr;
993d9ca1df4SBarry Smith   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
994d9ca1df4SBarry Smith   const PetscInt    *idx,*ii,*ridx=NULL;
995d9ca1df4SBarry Smith   PetscInt          ncols,k;
996ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
9972d61bbb3SSatish Balay 
9982d61bbb3SSatish Balay   PetscFunctionBegin;
999d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
100026e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
10012d61bbb3SSatish Balay 
10022d61bbb3SSatish Balay   idx = a->j;
10032d61bbb3SSatish Balay   v   = a->a;
100426e093fcSHong Zhang   if (usecprow) {
100526e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
100626e093fcSHong Zhang     ii   = a->compressedrow.i;
10077b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
1008ef16e671SStefano Zampini     ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
100926e093fcSHong Zhang   } else {
101026e093fcSHong Zhang     mbs = a->mbs;
10112d61bbb3SSatish Balay     ii  = a->i;
101226e093fcSHong Zhang     z   = zarray;
101326e093fcSHong Zhang   }
1014218c64b6SSatish Balay 
10152d61bbb3SSatish Balay   if (!a->mult_work) {
1016d0f46423SBarry Smith     k    = PetscMax(A->rmap->n,A->cmap->n);
1017854ce69bSBarry Smith     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
10182d61bbb3SSatish Balay   }
10192d61bbb3SSatish Balay   work = a->mult_work;
10202d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
10212d61bbb3SSatish Balay     n           = ii[1] - ii[0]; ii++;
10222d61bbb3SSatish Balay     ncols       = n*bs;
10232d61bbb3SSatish Balay     workt       = work;
10242d61bbb3SSatish Balay     for (j=0; j<n; j++) {
10252d61bbb3SSatish Balay       xb = x + bs*(*idx++);
10262d61bbb3SSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
10272d61bbb3SSatish Balay       workt += bs;
10282d61bbb3SSatish Balay     }
10297b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs*ridx[i];
103096b95a6bSBarry Smith     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
103171044d3cSBarry Smith     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
10322d61bbb3SSatish Balay     v += n*bs2;
103326e093fcSHong Zhang     if (!usecprow) z += bs;
10342d61bbb3SSatish Balay   }
1035d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
103626e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
10377c565772SBarry Smith   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
10382d61bbb3SSatish Balay   PetscFunctionReturn(0);
10392d61bbb3SSatish Balay }
10402d61bbb3SSatish Balay 
1041dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
10422d61bbb3SSatish Balay {
10432d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1044122f12eaSBarry Smith   const PetscScalar *x;
1045122f12eaSBarry Smith   PetscScalar       *y,*z,sum;
1046122f12eaSBarry Smith   const MatScalar   *v;
1047dfbe8321SBarry Smith   PetscErrorCode    ierr;
10487c565772SBarry Smith   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1049122f12eaSBarry Smith   const PetscInt    *idx,*ii;
1050ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
10512d61bbb3SSatish Balay 
10522d61bbb3SSatish Balay   PetscFunctionBegin;
10533649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1054d9ca1df4SBarry Smith   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
10552d61bbb3SSatish Balay 
10562d61bbb3SSatish Balay   idx = a->j;
10572d61bbb3SSatish Balay   v   = a->a;
105826e093fcSHong Zhang   if (usecprow) {
10594eb6d288SHong Zhang     if (zz != yy) {
1060122f12eaSBarry Smith       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
10614eb6d288SHong Zhang     }
106226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
106326e093fcSHong Zhang     ii   = a->compressedrow.i;
10647b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
106526e093fcSHong Zhang   } else {
10662d61bbb3SSatish Balay     ii = a->i;
106726e093fcSHong Zhang   }
10682d61bbb3SSatish Balay 
10692d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
1070122f12eaSBarry Smith     n = ii[1] - ii[0];
1071122f12eaSBarry Smith     ii++;
107226e093fcSHong Zhang     if (!usecprow) {
1073122f12eaSBarry Smith       sum         = y[i];
1074122f12eaSBarry Smith     } else {
1075122f12eaSBarry Smith       sum = y[ridx[i]];
1076122f12eaSBarry Smith     }
1077444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1078444d8c10SJed Brown     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1079122f12eaSBarry Smith     PetscSparseDensePlusDot(sum,x,v,idx,n);
1080122f12eaSBarry Smith     v   += n;
1081122f12eaSBarry Smith     idx += n;
1082122f12eaSBarry Smith     if (usecprow) {
1083122f12eaSBarry Smith       z[ridx[i]] = sum;
1084122f12eaSBarry Smith     } else {
1085122f12eaSBarry Smith       z[i] = sum;
108626e093fcSHong Zhang     }
10872d61bbb3SSatish Balay   }
10883649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1089d9ca1df4SBarry Smith   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
10907c565772SBarry Smith   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
10912d61bbb3SSatish Balay   PetscFunctionReturn(0);
10922d61bbb3SSatish Balay }
10932d61bbb3SSatish Balay 
1094dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
10952d61bbb3SSatish Balay {
10962d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1097d9ca1df4SBarry Smith   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1098d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
109926e093fcSHong Zhang   PetscScalar       x1,x2,*yarray,*zarray;
1100d9ca1df4SBarry Smith   const MatScalar   *v;
1101dfbe8321SBarry Smith   PetscErrorCode    ierr;
1102d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,j;
1103d9ca1df4SBarry Smith   const PetscInt    *idx,*ii,*ridx = NULL;
1104ace3abfcSBarry Smith   PetscBool         usecprow = a->compressedrow.use;
11052d61bbb3SSatish Balay 
11062d61bbb3SSatish Balay   PetscFunctionBegin;
1107d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1108d9ca1df4SBarry Smith   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
11092d61bbb3SSatish Balay 
11102d61bbb3SSatish Balay   idx = a->j;
11112d61bbb3SSatish Balay   v   = a->a;
111226e093fcSHong Zhang   if (usecprow) {
11134eb6d288SHong Zhang     if (zz != yy) {
11144eb6d288SHong Zhang       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
11154eb6d288SHong Zhang     }
111626e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
111726e093fcSHong Zhang     ii   = a->compressedrow.i;
11187b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
111926e093fcSHong Zhang   } else {
11202d61bbb3SSatish Balay     ii = a->i;
112126e093fcSHong Zhang     y  = yarray;
112226e093fcSHong Zhang     z  = zarray;
112326e093fcSHong Zhang   }
11242d61bbb3SSatish Balay 
11252d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
11262d61bbb3SSatish Balay     n = ii[1] - ii[0]; ii++;
112726e093fcSHong Zhang     if (usecprow) {
11287b2bb3b9SHong Zhang       z = zarray + 2*ridx[i];
11297b2bb3b9SHong Zhang       y = yarray + 2*ridx[i];
113026e093fcSHong Zhang     }
11312d61bbb3SSatish Balay     sum1 = y[0]; sum2 = y[1];
1132444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1133444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
11342d61bbb3SSatish Balay     for (j=0; j<n; j++) {
113526fbe8dcSKarl Rupp       xb = x + 2*(*idx++);
113626fbe8dcSKarl Rupp       x1 = xb[0];
113726fbe8dcSKarl Rupp       x2 = xb[1];
113826fbe8dcSKarl Rupp 
11392d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[2]*x2;
11402d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[3]*x2;
11412d61bbb3SSatish Balay       v    += 4;
11422d61bbb3SSatish Balay     }
11432d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2;
114426e093fcSHong Zhang     if (!usecprow) {
11452d61bbb3SSatish Balay       z += 2; y += 2;
11462d61bbb3SSatish Balay     }
114726e093fcSHong Zhang   }
1148d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1149d9ca1df4SBarry Smith   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1150dc0b31edSSatish Balay   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
11512d61bbb3SSatish Balay   PetscFunctionReturn(0);
11522d61bbb3SSatish Balay }
11532d61bbb3SSatish Balay 
1154dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
11552d61bbb3SSatish Balay {
11562d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1157d9ca1df4SBarry Smith   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1158d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1159d9ca1df4SBarry Smith   const MatScalar   *v;
1160dfbe8321SBarry Smith   PetscErrorCode    ierr;
1161d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,j,n;
1162d9ca1df4SBarry Smith   const PetscInt    *idx,*ii,*ridx = NULL;
1163ace3abfcSBarry Smith   PetscBool         usecprow = a->compressedrow.use;
11642d61bbb3SSatish Balay 
11652d61bbb3SSatish Balay   PetscFunctionBegin;
1166d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1167d9ca1df4SBarry Smith   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
11682d61bbb3SSatish Balay 
11692d61bbb3SSatish Balay   idx = a->j;
11702d61bbb3SSatish Balay   v   = a->a;
117126e093fcSHong Zhang   if (usecprow) {
11724eb6d288SHong Zhang     if (zz != yy) {
11734eb6d288SHong Zhang       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
11744eb6d288SHong Zhang     }
117526e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
117626e093fcSHong Zhang     ii   = a->compressedrow.i;
11777b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
117826e093fcSHong Zhang   } else {
11792d61bbb3SSatish Balay     ii = a->i;
118026e093fcSHong Zhang     y  = yarray;
118126e093fcSHong Zhang     z  = zarray;
118226e093fcSHong Zhang   }
11832d61bbb3SSatish Balay 
11842d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
11852d61bbb3SSatish Balay     n = ii[1] - ii[0]; ii++;
118626e093fcSHong Zhang     if (usecprow) {
11877b2bb3b9SHong Zhang       z = zarray + 3*ridx[i];
11887b2bb3b9SHong Zhang       y = yarray + 3*ridx[i];
118926e093fcSHong Zhang     }
11902d61bbb3SSatish Balay     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1191444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1192444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
11932d61bbb3SSatish Balay     for (j=0; j<n; j++) {
11942d61bbb3SSatish Balay       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
11952d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
11962d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
11972d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
11982d61bbb3SSatish Balay       v    += 9;
11992d61bbb3SSatish Balay     }
12002d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3;
120126e093fcSHong Zhang     if (!usecprow) {
12022d61bbb3SSatish Balay       z += 3; y += 3;
12032d61bbb3SSatish Balay     }
120426e093fcSHong Zhang   }
1205d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1206d9ca1df4SBarry Smith   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1207dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
12082d61bbb3SSatish Balay   PetscFunctionReturn(0);
12092d61bbb3SSatish Balay }
12102d61bbb3SSatish Balay 
1211dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
12122d61bbb3SSatish Balay {
12132d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1214d9ca1df4SBarry Smith   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1215d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1216d9ca1df4SBarry Smith   const MatScalar   *v;
1217dfbe8321SBarry Smith   PetscErrorCode    ierr;
1218d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,j,n;
1219d9ca1df4SBarry Smith   const PetscInt    *idx,*ii,*ridx=NULL;
1220ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
12212d61bbb3SSatish Balay 
12222d61bbb3SSatish Balay   PetscFunctionBegin;
1223d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1224d9ca1df4SBarry Smith   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
12252d61bbb3SSatish Balay 
12262d61bbb3SSatish Balay   idx = a->j;
12272d61bbb3SSatish Balay   v   = a->a;
122826e093fcSHong Zhang   if (usecprow) {
12294eb6d288SHong Zhang     if (zz != yy) {
12304eb6d288SHong Zhang       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
12314eb6d288SHong Zhang     }
123226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
123326e093fcSHong Zhang     ii   = a->compressedrow.i;
12347b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
123526e093fcSHong Zhang   } else {
12362d61bbb3SSatish Balay     ii = a->i;
123726e093fcSHong Zhang     y  = yarray;
123826e093fcSHong Zhang     z  = zarray;
123926e093fcSHong Zhang   }
12402d61bbb3SSatish Balay 
12412d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
12422d61bbb3SSatish Balay     n = ii[1] - ii[0]; ii++;
124326e093fcSHong Zhang     if (usecprow) {
12447b2bb3b9SHong Zhang       z = zarray + 4*ridx[i];
12457b2bb3b9SHong Zhang       y = yarray + 4*ridx[i];
124626e093fcSHong Zhang     }
12472d61bbb3SSatish Balay     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1248444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1249444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
12502d61bbb3SSatish Balay     for (j=0; j<n; j++) {
12512d61bbb3SSatish Balay       xb    = x + 4*(*idx++);
12522d61bbb3SSatish Balay       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
12532d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
12542d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
12552d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
12562d61bbb3SSatish Balay       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
12572d61bbb3SSatish Balay       v    += 16;
12582d61bbb3SSatish Balay     }
12592d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
126026e093fcSHong Zhang     if (!usecprow) {
12612d61bbb3SSatish Balay       z += 4; y += 4;
12622d61bbb3SSatish Balay     }
126326e093fcSHong Zhang   }
1264d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1265d9ca1df4SBarry Smith   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1266dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
12672d61bbb3SSatish Balay   PetscFunctionReturn(0);
12682d61bbb3SSatish Balay }
12692d61bbb3SSatish Balay 
1270dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
12712d61bbb3SSatish Balay {
12722d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1273d9ca1df4SBarry Smith   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1274d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
127526e093fcSHong Zhang   PetscScalar       *yarray,*zarray;
1276d9ca1df4SBarry Smith   const MatScalar   *v;
1277dfbe8321SBarry Smith   PetscErrorCode    ierr;
1278d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,j,n;
1279d9ca1df4SBarry Smith   const PetscInt    *idx,*ii,*ridx = NULL;
1280ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
12812d61bbb3SSatish Balay 
12822d61bbb3SSatish Balay   PetscFunctionBegin;
1283d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1284d9ca1df4SBarry Smith   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
12852d61bbb3SSatish Balay 
12862d61bbb3SSatish Balay   idx = a->j;
12872d61bbb3SSatish Balay   v   = a->a;
128826e093fcSHong Zhang   if (usecprow) {
12894eb6d288SHong Zhang     if (zz != yy) {
12904eb6d288SHong Zhang       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
12914eb6d288SHong Zhang     }
129226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
129326e093fcSHong Zhang     ii   = a->compressedrow.i;
12947b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
129526e093fcSHong Zhang   } else {
12962d61bbb3SSatish Balay     ii = a->i;
129726e093fcSHong Zhang     y  = yarray;
129826e093fcSHong Zhang     z  = zarray;
129926e093fcSHong Zhang   }
13002d61bbb3SSatish Balay 
13012d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
13022d61bbb3SSatish Balay     n = ii[1] - ii[0]; ii++;
130326e093fcSHong Zhang     if (usecprow) {
13047b2bb3b9SHong Zhang       z = zarray + 5*ridx[i];
13057b2bb3b9SHong Zhang       y = yarray + 5*ridx[i];
130626e093fcSHong Zhang     }
13072d61bbb3SSatish Balay     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1308444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1309444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
13102d61bbb3SSatish Balay     for (j=0; j<n; j++) {
13112d61bbb3SSatish Balay       xb    = x + 5*(*idx++);
13122d61bbb3SSatish Balay       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
13132d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
13142d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
13152d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
13162d61bbb3SSatish Balay       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
13172d61bbb3SSatish Balay       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
13182d61bbb3SSatish Balay       v    += 25;
13192d61bbb3SSatish Balay     }
13202d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
132126e093fcSHong Zhang     if (!usecprow) {
13222d61bbb3SSatish Balay       z += 5; y += 5;
13232d61bbb3SSatish Balay     }
132426e093fcSHong Zhang   }
1325d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1326d9ca1df4SBarry Smith   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1327dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
13282d61bbb3SSatish Balay   PetscFunctionReturn(0);
13292d61bbb3SSatish Balay }
1330dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
133115091d37SBarry Smith {
133215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1333d9ca1df4SBarry Smith   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1334d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
133526e093fcSHong Zhang   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1336d9ca1df4SBarry Smith   const MatScalar   *v;
1337dfbe8321SBarry Smith   PetscErrorCode    ierr;
1338d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,j,n;
1339d9ca1df4SBarry Smith   const PetscInt    *idx,*ii,*ridx=NULL;
1340ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
134115091d37SBarry Smith 
134215091d37SBarry Smith   PetscFunctionBegin;
1343d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1344d9ca1df4SBarry Smith   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
134515091d37SBarry Smith 
134615091d37SBarry Smith   idx = a->j;
134715091d37SBarry Smith   v   = a->a;
134826e093fcSHong Zhang   if (usecprow) {
13494eb6d288SHong Zhang     if (zz != yy) {
13504eb6d288SHong Zhang       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
13514eb6d288SHong Zhang     }
135226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
135326e093fcSHong Zhang     ii   = a->compressedrow.i;
13547b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
135526e093fcSHong Zhang   } else {
135615091d37SBarry Smith     ii = a->i;
135726e093fcSHong Zhang     y  = yarray;
135826e093fcSHong Zhang     z  = zarray;
135926e093fcSHong Zhang   }
136015091d37SBarry Smith 
136115091d37SBarry Smith   for (i=0; i<mbs; i++) {
136215091d37SBarry Smith     n = ii[1] - ii[0]; ii++;
136326e093fcSHong Zhang     if (usecprow) {
13647b2bb3b9SHong Zhang       z = zarray + 6*ridx[i];
13657b2bb3b9SHong Zhang       y = yarray + 6*ridx[i];
136626e093fcSHong Zhang     }
136715091d37SBarry Smith     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1368444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1369444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
137015091d37SBarry Smith     for (j=0; j<n; j++) {
13713b95cb0eSSatish Balay       xb    = x + 6*(*idx++);
137215091d37SBarry Smith       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
137315091d37SBarry Smith       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
137415091d37SBarry Smith       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
137515091d37SBarry Smith       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
137615091d37SBarry Smith       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
137715091d37SBarry Smith       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
137815091d37SBarry Smith       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
137915091d37SBarry Smith       v    += 36;
138015091d37SBarry Smith     }
138115091d37SBarry Smith     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
138226e093fcSHong Zhang     if (!usecprow) {
138315091d37SBarry Smith       z += 6; y += 6;
138415091d37SBarry Smith     }
138526e093fcSHong Zhang   }
1386d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1387d9ca1df4SBarry Smith   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1388dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
138915091d37SBarry Smith   PetscFunctionReturn(0);
139015091d37SBarry Smith }
13912d61bbb3SSatish Balay 
1392dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
13932d61bbb3SSatish Balay {
13942d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1395d9ca1df4SBarry Smith   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1396d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
139726e093fcSHong Zhang   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1398d9ca1df4SBarry Smith   const MatScalar   *v;
1399dfbe8321SBarry Smith   PetscErrorCode    ierr;
1400d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,j,n;
1401d9ca1df4SBarry Smith   const PetscInt    *idx,*ii,*ridx = NULL;
1402ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
14032d61bbb3SSatish Balay 
14042d61bbb3SSatish Balay   PetscFunctionBegin;
1405d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1406d9ca1df4SBarry Smith   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
14072d61bbb3SSatish Balay 
14082d61bbb3SSatish Balay   idx = a->j;
14092d61bbb3SSatish Balay   v   = a->a;
141026e093fcSHong Zhang   if (usecprow) {
14114eb6d288SHong Zhang     if (zz != yy) {
14124eb6d288SHong Zhang       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
14134eb6d288SHong Zhang     }
141426e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
141526e093fcSHong Zhang     ii   = a->compressedrow.i;
14167b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
141726e093fcSHong Zhang   } else {
14182d61bbb3SSatish Balay     ii = a->i;
141926e093fcSHong Zhang     y  = yarray;
142026e093fcSHong Zhang     z  = zarray;
142126e093fcSHong Zhang   }
14222d61bbb3SSatish Balay 
14232d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
14242d61bbb3SSatish Balay     n = ii[1] - ii[0]; ii++;
142526e093fcSHong Zhang     if (usecprow) {
14267b2bb3b9SHong Zhang       z = zarray + 7*ridx[i];
14277b2bb3b9SHong Zhang       y = yarray + 7*ridx[i];
142826e093fcSHong Zhang     }
14292d61bbb3SSatish Balay     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1430444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1431444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
14322d61bbb3SSatish Balay     for (j=0; j<n; j++) {
14332d61bbb3SSatish Balay       xb    = x + 7*(*idx++);
14342d61bbb3SSatish Balay       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
14352d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
14362d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
14372d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
14382d61bbb3SSatish Balay       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
14392d61bbb3SSatish Balay       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
14402d61bbb3SSatish Balay       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
14412d61bbb3SSatish Balay       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
14422d61bbb3SSatish Balay       v    += 49;
14432d61bbb3SSatish Balay     }
14442d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
144526e093fcSHong Zhang     if (!usecprow) {
14462d61bbb3SSatish Balay       z += 7; y += 7;
14472d61bbb3SSatish Balay     }
144826e093fcSHong Zhang   }
1449d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1450d9ca1df4SBarry Smith   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1451dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
14522d61bbb3SSatish Balay   PetscFunctionReturn(0);
14532d61bbb3SSatish Balay }
1454218c64b6SSatish Balay 
1455dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
14562d61bbb3SSatish Balay {
14572d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1458d9ca1df4SBarry Smith   PetscScalar       *z = 0,*work,*workt,*zarray;
1459d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1460d9ca1df4SBarry Smith   const MatScalar   *v;
14616849ba73SBarry Smith   PetscErrorCode    ierr;
1462d9ca1df4SBarry Smith   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1463d9ca1df4SBarry Smith   PetscInt          ncols,k;
1464d9ca1df4SBarry Smith   const PetscInt    *ridx = NULL,*idx,*ii;
1465ace3abfcSBarry Smith   PetscBool         usecprow = a->compressedrow.use;
1466218c64b6SSatish Balay 
14672d61bbb3SSatish Balay   PetscFunctionBegin;
1468343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1469d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
147026e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
14712d61bbb3SSatish Balay 
14722d61bbb3SSatish Balay   idx = a->j;
14732d61bbb3SSatish Balay   v   = a->a;
147426e093fcSHong Zhang   if (usecprow) {
147526e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
147626e093fcSHong Zhang     ii   = a->compressedrow.i;
14777b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
147826e093fcSHong Zhang   } else {
147926e093fcSHong Zhang     mbs = a->mbs;
14802d61bbb3SSatish Balay     ii  = a->i;
148126e093fcSHong Zhang     z   = zarray;
148226e093fcSHong Zhang   }
14832d61bbb3SSatish Balay 
14842d61bbb3SSatish Balay   if (!a->mult_work) {
1485d0f46423SBarry Smith     k    = PetscMax(A->rmap->n,A->cmap->n);
1486854ce69bSBarry Smith     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
14872d61bbb3SSatish Balay   }
14882d61bbb3SSatish Balay   work = a->mult_work;
14892d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
14902d61bbb3SSatish Balay     n     = ii[1] - ii[0]; ii++;
14912d61bbb3SSatish Balay     ncols = n*bs;
14922d61bbb3SSatish Balay     workt = work;
14932d61bbb3SSatish Balay     for (j=0; j<n; j++) {
14942d61bbb3SSatish Balay       xb = x + bs*(*idx++);
14952d61bbb3SSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
14962d61bbb3SSatish Balay       workt += bs;
14972d61bbb3SSatish Balay     }
14987b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs*ridx[i];
149996b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
150071044d3cSBarry Smith     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
15012d61bbb3SSatish Balay     v += n*bs2;
150226fbe8dcSKarl Rupp     if (!usecprow) z += bs;
150326e093fcSHong Zhang   }
1504d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
150526e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1506dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
15072d61bbb3SSatish Balay   PetscFunctionReturn(0);
15082d61bbb3SSatish Balay }
15092d61bbb3SSatish Balay 
1510547795f9SHong Zhang PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1511547795f9SHong Zhang {
1512547795f9SHong Zhang   PetscScalar    zero = 0.0;
1513547795f9SHong Zhang   PetscErrorCode ierr;
1514547795f9SHong Zhang 
1515547795f9SHong Zhang   PetscFunctionBegin;
1516547795f9SHong Zhang   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1517547795f9SHong Zhang   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1518547795f9SHong Zhang   PetscFunctionReturn(0);
1519547795f9SHong Zhang }
1520547795f9SHong Zhang 
1521dfbe8321SBarry Smith PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
15222d61bbb3SSatish Balay {
15233447b6efSHong Zhang   PetscScalar    zero = 0.0;
15246849ba73SBarry Smith   PetscErrorCode ierr;
15252d61bbb3SSatish Balay 
15262d61bbb3SSatish Balay   PetscFunctionBegin;
15272dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
15283447b6efSHong Zhang   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
15292d61bbb3SSatish Balay   PetscFunctionReturn(0);
15302d61bbb3SSatish Balay }
15312d61bbb3SSatish Balay 
1532547795f9SHong Zhang PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1533547795f9SHong Zhang {
1534547795f9SHong Zhang   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1535b8c08b77SHong Zhang   PetscScalar       *z,x1,x2,x3,x4,x5;
1536d9ca1df4SBarry Smith   const PetscScalar *x,*xb = NULL;
1537d9ca1df4SBarry Smith   const MatScalar   *v;
1538547795f9SHong Zhang   PetscErrorCode    ierr;
1539b8c08b77SHong Zhang   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1540d9ca1df4SBarry Smith   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1541547795f9SHong Zhang   Mat_CompressedRow cprow = a->compressedrow;
1542ace3abfcSBarry Smith   PetscBool         usecprow = cprow.use;
1543547795f9SHong Zhang 
1544547795f9SHong Zhang   PetscFunctionBegin;
1545343551c4SBarry Smith   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1546d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1547547795f9SHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1548547795f9SHong Zhang 
1549547795f9SHong Zhang   idx = a->j;
1550547795f9SHong Zhang   v   = a->a;
1551547795f9SHong Zhang   if (usecprow) {
1552547795f9SHong Zhang     mbs  = cprow.nrows;
1553547795f9SHong Zhang     ii   = cprow.i;
1554547795f9SHong Zhang     ridx = cprow.rindex;
1555547795f9SHong Zhang   } else {
1556547795f9SHong Zhang     mbs=a->mbs;
1557547795f9SHong Zhang     ii = a->i;
1558547795f9SHong Zhang     xb = x;
1559547795f9SHong Zhang   }
1560547795f9SHong Zhang 
1561547795f9SHong Zhang   switch (bs) {
1562547795f9SHong Zhang   case 1:
1563547795f9SHong Zhang     for (i=0; i<mbs; i++) {
1564547795f9SHong Zhang       if (usecprow) xb = x + ridx[i];
1565547795f9SHong Zhang       x1 = xb[0];
1566547795f9SHong Zhang       ib = idx + ii[0];
1567547795f9SHong Zhang       n  = ii[1] - ii[0]; ii++;
1568547795f9SHong Zhang       for (j=0; j<n; j++) {
1569547795f9SHong Zhang         rval     = ib[j];
1570547795f9SHong Zhang         z[rval] += PetscConj(*v) * x1;
1571547795f9SHong Zhang         v++;
1572547795f9SHong Zhang       }
1573547795f9SHong Zhang       if (!usecprow) xb++;
1574547795f9SHong Zhang     }
1575547795f9SHong Zhang     break;
1576547795f9SHong Zhang   case 2:
1577547795f9SHong Zhang     for (i=0; i<mbs; i++) {
1578547795f9SHong Zhang       if (usecprow) xb = x + 2*ridx[i];
1579547795f9SHong Zhang       x1 = xb[0]; x2 = xb[1];
1580547795f9SHong Zhang       ib = idx + ii[0];
1581547795f9SHong Zhang       n  = ii[1] - ii[0]; ii++;
1582547795f9SHong Zhang       for (j=0; j<n; j++) {
1583547795f9SHong Zhang         rval       = ib[j]*2;
1584547795f9SHong Zhang         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1585547795f9SHong Zhang         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1586547795f9SHong Zhang         v         += 4;
1587547795f9SHong Zhang       }
1588547795f9SHong Zhang       if (!usecprow) xb += 2;
1589547795f9SHong Zhang     }
1590547795f9SHong Zhang     break;
1591547795f9SHong Zhang   case 3:
1592547795f9SHong Zhang     for (i=0; i<mbs; i++) {
1593547795f9SHong Zhang       if (usecprow) xb = x + 3*ridx[i];
1594547795f9SHong Zhang       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1595547795f9SHong Zhang       ib = idx + ii[0];
1596547795f9SHong Zhang       n  = ii[1] - ii[0]; ii++;
1597547795f9SHong Zhang       for (j=0; j<n; j++) {
1598547795f9SHong Zhang         rval       = ib[j]*3;
1599547795f9SHong Zhang         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1600547795f9SHong Zhang         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1601547795f9SHong Zhang         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1602547795f9SHong Zhang         v         += 9;
1603547795f9SHong Zhang       }
1604547795f9SHong Zhang       if (!usecprow) xb += 3;
1605547795f9SHong Zhang     }
1606547795f9SHong Zhang     break;
1607547795f9SHong Zhang   case 4:
1608547795f9SHong Zhang     for (i=0; i<mbs; i++) {
1609547795f9SHong Zhang       if (usecprow) xb = x + 4*ridx[i];
1610547795f9SHong Zhang       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1611547795f9SHong Zhang       ib = idx + ii[0];
1612547795f9SHong Zhang       n  = ii[1] - ii[0]; ii++;
1613547795f9SHong Zhang       for (j=0; j<n; j++) {
1614547795f9SHong Zhang         rval       = ib[j]*4;
1615547795f9SHong Zhang         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1616547795f9SHong Zhang         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1617547795f9SHong Zhang         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1618547795f9SHong Zhang         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1619547795f9SHong Zhang         v         += 16;
1620547795f9SHong Zhang       }
1621547795f9SHong Zhang       if (!usecprow) xb += 4;
1622547795f9SHong Zhang     }
1623547795f9SHong Zhang     break;
1624547795f9SHong Zhang   case 5:
1625547795f9SHong Zhang     for (i=0; i<mbs; i++) {
1626547795f9SHong Zhang       if (usecprow) xb = x + 5*ridx[i];
1627547795f9SHong Zhang       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1628547795f9SHong Zhang       x4 = xb[3]; x5 = xb[4];
1629547795f9SHong Zhang       ib = idx + ii[0];
1630547795f9SHong Zhang       n  = ii[1] - ii[0]; ii++;
1631547795f9SHong Zhang       for (j=0; j<n; j++) {
1632547795f9SHong Zhang         rval       = ib[j]*5;
1633547795f9SHong Zhang         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1634547795f9SHong Zhang         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1635547795f9SHong Zhang         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1636547795f9SHong Zhang         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1637547795f9SHong Zhang         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1638547795f9SHong Zhang         v         += 25;
1639547795f9SHong Zhang       }
1640547795f9SHong Zhang       if (!usecprow) xb += 5;
1641547795f9SHong Zhang     }
1642547795f9SHong Zhang     break;
1643968ae2c8SSatish Balay   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1644968ae2c8SSatish Balay     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1645968ae2c8SSatish Balay #if 0
1646968ae2c8SSatish Balay     {
1647b8c08b77SHong Zhang       PetscInt          ncols,k,bs2=a->bs2;
1648b8c08b77SHong Zhang       PetscScalar       *work,*workt,zb;
1649d9ca1df4SBarry Smith       const PetscScalar *xtmp;
1650547795f9SHong Zhang       if (!a->mult_work) {
1651547795f9SHong Zhang         k    = PetscMax(A->rmap->n,A->cmap->n);
1652854ce69bSBarry Smith         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1653547795f9SHong Zhang       }
1654547795f9SHong Zhang       work = a->mult_work;
1655547795f9SHong Zhang       xtmp = x;
1656547795f9SHong Zhang       for (i=0; i<mbs; i++) {
1657547795f9SHong Zhang         n     = ii[1] - ii[0]; ii++;
1658547795f9SHong Zhang         ncols = n*bs;
1659547795f9SHong Zhang         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
166026fbe8dcSKarl Rupp         if (usecprow) xtmp = x + bs*ridx[i];
166196b95a6bSBarry Smith         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1662547795f9SHong Zhang         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1663547795f9SHong Zhang         v += n*bs2;
1664547795f9SHong Zhang         if (!usecprow) xtmp += bs;
1665547795f9SHong Zhang         workt = work;
1666547795f9SHong Zhang         for (j=0; j<n; j++) {
1667547795f9SHong Zhang           zb = z + bs*(*idx++);
1668547795f9SHong Zhang           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1669547795f9SHong Zhang           workt += bs;
1670547795f9SHong Zhang         }
1671547795f9SHong Zhang       }
1672547795f9SHong Zhang     }
1673968ae2c8SSatish Balay #endif
1674547795f9SHong Zhang   }
1675d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1676547795f9SHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1677547795f9SHong Zhang   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1678547795f9SHong Zhang   PetscFunctionReturn(0);
1679547795f9SHong Zhang }
1680547795f9SHong Zhang 
1681dfbe8321SBarry Smith PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
16822d61bbb3SSatish Balay {
16832d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1684d9ca1df4SBarry Smith   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
1685d9ca1df4SBarry Smith   const PetscScalar *x,*xb = 0;
1686d9ca1df4SBarry Smith   const MatScalar   *v;
16876849ba73SBarry Smith   PetscErrorCode    ierr;
1688d9ca1df4SBarry Smith   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
1689d9ca1df4SBarry Smith   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
16903447b6efSHong Zhang   Mat_CompressedRow cprow   = a->compressedrow;
1691ace3abfcSBarry Smith   PetscBool         usecprow=cprow.use;
16922d61bbb3SSatish Balay 
16932d61bbb3SSatish Balay   PetscFunctionBegin;
1694343551c4SBarry Smith   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1695d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
16963447b6efSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
16972d61bbb3SSatish Balay 
16982d61bbb3SSatish Balay   idx = a->j;
16992d61bbb3SSatish Balay   v   = a->a;
17003447b6efSHong Zhang   if (usecprow) {
17013447b6efSHong Zhang     mbs  = cprow.nrows;
17023447b6efSHong Zhang     ii   = cprow.i;
17037b2bb3b9SHong Zhang     ridx = cprow.rindex;
17043447b6efSHong Zhang   } else {
17053447b6efSHong Zhang     mbs=a->mbs;
17062d61bbb3SSatish Balay     ii = a->i;
1707f1af5d2fSBarry Smith     xb = x;
17083447b6efSHong Zhang   }
17092d61bbb3SSatish Balay 
17102d61bbb3SSatish Balay   switch (bs) {
17112d61bbb3SSatish Balay   case 1:
17122d61bbb3SSatish Balay     for (i=0; i<mbs; i++) {
17137b2bb3b9SHong Zhang       if (usecprow) xb = x + ridx[i];
1714f1af5d2fSBarry Smith       x1 = xb[0];
17153447b6efSHong Zhang       ib = idx + ii[0];
17163447b6efSHong Zhang       n  = ii[1] - ii[0]; ii++;
17172d61bbb3SSatish Balay       for (j=0; j<n; j++) {
17182d61bbb3SSatish Balay         rval     = ib[j];
1719f1af5d2fSBarry Smith         z[rval] += *v * x1;
1720f1af5d2fSBarry Smith         v++;
17212d61bbb3SSatish Balay       }
17223447b6efSHong Zhang       if (!usecprow) xb++;
17232d61bbb3SSatish Balay     }
17242d61bbb3SSatish Balay     break;
17252d61bbb3SSatish Balay   case 2:
17262d61bbb3SSatish Balay     for (i=0; i<mbs; i++) {
17277b2bb3b9SHong Zhang       if (usecprow) xb = x + 2*ridx[i];
1728f1af5d2fSBarry Smith       x1 = xb[0]; x2 = xb[1];
17293447b6efSHong Zhang       ib = idx + ii[0];
17303447b6efSHong Zhang       n  = ii[1] - ii[0]; ii++;
17312d61bbb3SSatish Balay       for (j=0; j<n; j++) {
17322d61bbb3SSatish Balay         rval       = ib[j]*2;
17332d61bbb3SSatish Balay         z[rval++] += v[0]*x1 + v[1]*x2;
17342d61bbb3SSatish Balay         z[rval++] += v[2]*x1 + v[3]*x2;
17352d61bbb3SSatish Balay         v         += 4;
17362d61bbb3SSatish Balay       }
17373447b6efSHong Zhang       if (!usecprow) xb += 2;
17382d61bbb3SSatish Balay     }
17392d61bbb3SSatish Balay     break;
17402d61bbb3SSatish Balay   case 3:
17412d61bbb3SSatish Balay     for (i=0; i<mbs; i++) {
17427b2bb3b9SHong Zhang       if (usecprow) xb = x + 3*ridx[i];
1743f1af5d2fSBarry Smith       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
17443447b6efSHong Zhang       ib = idx + ii[0];
17453447b6efSHong Zhang       n  = ii[1] - ii[0]; ii++;
17462d61bbb3SSatish Balay       for (j=0; j<n; j++) {
17472d61bbb3SSatish Balay         rval       = ib[j]*3;
17482d61bbb3SSatish Balay         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
17492d61bbb3SSatish Balay         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
17502d61bbb3SSatish Balay         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
17512d61bbb3SSatish Balay         v         += 9;
17522d61bbb3SSatish Balay       }
17533447b6efSHong Zhang       if (!usecprow) xb += 3;
17542d61bbb3SSatish Balay     }
17552d61bbb3SSatish Balay     break;
17562d61bbb3SSatish Balay   case 4:
17572d61bbb3SSatish Balay     for (i=0; i<mbs; i++) {
17587b2bb3b9SHong Zhang       if (usecprow) xb = x + 4*ridx[i];
1759f1af5d2fSBarry Smith       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
17603447b6efSHong Zhang       ib = idx + ii[0];
17613447b6efSHong Zhang       n  = ii[1] - ii[0]; ii++;
17622d61bbb3SSatish Balay       for (j=0; j<n; j++) {
17632d61bbb3SSatish Balay         rval       = ib[j]*4;
17642d61bbb3SSatish Balay         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
17652d61bbb3SSatish Balay         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
17662d61bbb3SSatish Balay         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
17672d61bbb3SSatish Balay         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
17682d61bbb3SSatish Balay         v         += 16;
17692d61bbb3SSatish Balay       }
17703447b6efSHong Zhang       if (!usecprow) xb += 4;
17712d61bbb3SSatish Balay     }
17722d61bbb3SSatish Balay     break;
17732d61bbb3SSatish Balay   case 5:
17742d61bbb3SSatish Balay     for (i=0; i<mbs; i++) {
17757b2bb3b9SHong Zhang       if (usecprow) xb = x + 5*ridx[i];
1776f1af5d2fSBarry Smith       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
17772d61bbb3SSatish Balay       x4 = xb[3]; x5 = xb[4];
17783447b6efSHong Zhang       ib = idx + ii[0];
17793447b6efSHong Zhang       n  = ii[1] - ii[0]; ii++;
17802d61bbb3SSatish Balay       for (j=0; j<n; j++) {
17812d61bbb3SSatish Balay         rval       = ib[j]*5;
17822d61bbb3SSatish Balay         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
17832d61bbb3SSatish Balay         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
17842d61bbb3SSatish Balay         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
17852d61bbb3SSatish Balay         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
17862d61bbb3SSatish Balay         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
17872d61bbb3SSatish Balay         v         += 25;
17882d61bbb3SSatish Balay       }
17893447b6efSHong Zhang       if (!usecprow) xb += 5;
17902d61bbb3SSatish Balay     }
17912d61bbb3SSatish Balay     break;
1792f1af5d2fSBarry Smith   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1793690b6cddSBarry Smith     PetscInt          ncols,k;
1794d9ca1df4SBarry Smith     PetscScalar       *work,*workt;
1795d9ca1df4SBarry Smith     const PetscScalar *xtmp;
17962d61bbb3SSatish Balay     if (!a->mult_work) {
1797d0f46423SBarry Smith       k    = PetscMax(A->rmap->n,A->cmap->n);
1798854ce69bSBarry Smith       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
17992d61bbb3SSatish Balay     }
18002d61bbb3SSatish Balay     work = a->mult_work;
18013447b6efSHong Zhang     xtmp = x;
18022d61bbb3SSatish Balay     for (i=0; i<mbs; i++) {
18032d61bbb3SSatish Balay       n     = ii[1] - ii[0]; ii++;
18042d61bbb3SSatish Balay       ncols = n*bs;
180587828ca2SBarry Smith       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
180626fbe8dcSKarl Rupp       if (usecprow) xtmp = x + bs*ridx[i];
180796b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
180871044d3cSBarry Smith       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
18092d61bbb3SSatish Balay       v += n*bs2;
18103447b6efSHong Zhang       if (!usecprow) xtmp += bs;
18112d61bbb3SSatish Balay       workt = work;
18122d61bbb3SSatish Balay       for (j=0; j<n; j++) {
18132d61bbb3SSatish Balay         zb = z + bs*(*idx++);
18142d61bbb3SSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
18152d61bbb3SSatish Balay         workt += bs;
18162d61bbb3SSatish Balay       }
18172d61bbb3SSatish Balay     }
18182d61bbb3SSatish Balay     }
18192d61bbb3SSatish Balay   }
1820d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
18213447b6efSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1822dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
18232d61bbb3SSatish Balay   PetscFunctionReturn(0);
18242d61bbb3SSatish Balay }
18252d61bbb3SSatish Balay 
1826f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
18272d61bbb3SSatish Balay {
18282d61bbb3SSatish Balay   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
1829690b6cddSBarry Smith   PetscInt       totalnz = a->bs2*a->nz;
1830f4df32b1SMatthew Knepley   PetscScalar    oalpha  = alpha;
1831efee365bSSatish Balay   PetscErrorCode ierr;
1832c5df96a5SBarry Smith   PetscBLASInt   one = 1,tnz;
18332d61bbb3SSatish Balay 
18342d61bbb3SSatish Balay   PetscFunctionBegin;
1835c5df96a5SBarry Smith   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
18368b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
1837efee365bSSatish Balay   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
18382d61bbb3SSatish Balay   PetscFunctionReturn(0);
18392d61bbb3SSatish Balay }
18402d61bbb3SSatish Balay 
1841dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
18422d61bbb3SSatish Balay {
18438a62d963SHong Zhang   PetscErrorCode ierr;
18442d61bbb3SSatish Balay   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
18453f1db9ecSBarry Smith   MatScalar      *v  = a->a;
1846329f5518SBarry Smith   PetscReal      sum = 0.0;
1847d0f46423SBarry Smith   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
18482d61bbb3SSatish Balay 
18492d61bbb3SSatish Balay   PetscFunctionBegin;
18502d61bbb3SSatish Balay   if (type == NORM_FROBENIUS) {
1851570b7f6dSBarry Smith #if defined(PETSC_USE_REAL___FP16)
1852570b7f6dSBarry Smith     PetscBLASInt one = 1,cnt = bs2*nz;
1853570b7f6dSBarry Smith     *norm = BLASnrm2_(&cnt,v,&one);
1854570b7f6dSBarry Smith #else
18552d61bbb3SSatish Balay     for (i=0; i<bs2*nz; i++) {
1856329f5518SBarry Smith       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
18572d61bbb3SSatish Balay     }
1858570b7f6dSBarry Smith #endif
18598f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum);
186051f70360SJed Brown     ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr);
18618a62d963SHong Zhang   } else if (type == NORM_1) { /* maximum column sum */
18628a62d963SHong Zhang     PetscReal *tmp;
18638a62d963SHong Zhang     PetscInt  *bcol = a->j;
1864854ce69bSBarry Smith     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
18658a62d963SHong Zhang     for (i=0; i<nz; i++) {
18668a62d963SHong Zhang       for (j=0; j<bs; j++) {
18678a62d963SHong Zhang         k1 = bs*(*bcol) + j; /* column index */
18688a62d963SHong Zhang         for (k=0; k<bs; k++) {
18698a62d963SHong Zhang           tmp[k1] += PetscAbsScalar(*v); v++;
18708a62d963SHong Zhang         }
18718a62d963SHong Zhang       }
18728a62d963SHong Zhang       bcol++;
18738a62d963SHong Zhang     }
18748a62d963SHong Zhang     *norm = 0.0;
1875d0f46423SBarry Smith     for (j=0; j<A->cmap->n; j++) {
18768a62d963SHong Zhang       if (tmp[j] > *norm) *norm = tmp[j];
18778a62d963SHong Zhang     }
18788a62d963SHong Zhang     ierr = PetscFree(tmp);CHKERRQ(ierr);
187951f70360SJed Brown     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
1880596552b5SBarry Smith   } else if (type == NORM_INFINITY) { /* maximum row sum */
1881596552b5SBarry Smith     *norm = 0.0;
1882596552b5SBarry Smith     for (k=0; k<bs; k++) {
188374f84c7bSSatish Balay       for (j=0; j<a->mbs; j++) {
1884596552b5SBarry Smith         v   = a->a + bs2*a->i[j] + k;
1885596552b5SBarry Smith         sum = 0.0;
1886596552b5SBarry Smith         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
18870e90e235SBarry Smith           for (k1=0; k1<bs; k1++) {
1888596552b5SBarry Smith             sum += PetscAbsScalar(*v);
1889596552b5SBarry Smith             v   += bs;
18902d61bbb3SSatish Balay           }
18910e90e235SBarry Smith         }
1892596552b5SBarry Smith         if (sum > *norm) *norm = sum;
1893596552b5SBarry Smith       }
1894596552b5SBarry Smith     }
189551f70360SJed Brown     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
1896e7e72b3dSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
18972d61bbb3SSatish Balay   PetscFunctionReturn(0);
18982d61bbb3SSatish Balay }
18992d61bbb3SSatish Balay 
19002d61bbb3SSatish Balay 
1901ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
19022d61bbb3SSatish Balay {
19032d61bbb3SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
1904dfbe8321SBarry Smith   PetscErrorCode ierr;
19052d61bbb3SSatish Balay 
19062d61bbb3SSatish Balay   PetscFunctionBegin;
19072d61bbb3SSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1908d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1909273d9f13SBarry Smith     *flg = PETSC_FALSE;
1910273d9f13SBarry Smith     PetscFunctionReturn(0);
19112d61bbb3SSatish Balay   }
19122d61bbb3SSatish Balay 
19132d61bbb3SSatish Balay   /* if the a->i are the same */
1914690b6cddSBarry Smith   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
191526fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
19162d61bbb3SSatish Balay 
19172d61bbb3SSatish Balay   /* if a->j are the same */
1918690b6cddSBarry Smith   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
191926fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
192026fbe8dcSKarl Rupp 
19212d61bbb3SSatish Balay   /* if a->a are the same */
1922d0f46423SBarry Smith   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
19232d61bbb3SSatish Balay   PetscFunctionReturn(0);
19242d61bbb3SSatish Balay 
19252d61bbb3SSatish Balay }
19262d61bbb3SSatish Balay 
1927dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
19282d61bbb3SSatish Balay {
19292d61bbb3SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1930dfbe8321SBarry Smith   PetscErrorCode ierr;
1931690b6cddSBarry Smith   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
193287828ca2SBarry Smith   PetscScalar    *x,zero = 0.0;
19333f1db9ecSBarry Smith   MatScalar      *aa,*aa_j;
19342d61bbb3SSatish Balay 
19352d61bbb3SSatish Balay   PetscFunctionBegin;
1936e32f2f54SBarry Smith   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1937d0f46423SBarry Smith   bs   = A->rmap->bs;
19382d61bbb3SSatish Balay   aa   = a->a;
19392d61bbb3SSatish Balay   ai   = a->i;
19402d61bbb3SSatish Balay   aj   = a->j;
19412d61bbb3SSatish Balay   ambs = a->mbs;
19422d61bbb3SSatish Balay   bs2  = a->bs2;
19432d61bbb3SSatish Balay 
19442dcb1b2aSMatthew Knepley   ierr = VecSet(v,zero);CHKERRQ(ierr);
19451ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1946e1311b90SBarry Smith   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1947e32f2f54SBarry Smith   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
19482d61bbb3SSatish Balay   for (i=0; i<ambs; i++) {
19492d61bbb3SSatish Balay     for (j=ai[i]; j<ai[i+1]; j++) {
19502d61bbb3SSatish Balay       if (aj[j] == i) {
19512d61bbb3SSatish Balay         row  = i*bs;
19522d61bbb3SSatish Balay         aa_j = aa+j*bs2;
19532d61bbb3SSatish Balay         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
19542d61bbb3SSatish Balay         break;
19552d61bbb3SSatish Balay       }
19562d61bbb3SSatish Balay     }
19572d61bbb3SSatish Balay   }
19581ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
19592d61bbb3SSatish Balay   PetscFunctionReturn(0);
19602d61bbb3SSatish Balay }
19612d61bbb3SSatish Balay 
1962dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
19632d61bbb3SSatish Balay {
19642d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
196553ef36baSBarry Smith   const PetscScalar *l,*r,*li,*ri;
196653ef36baSBarry Smith   PetscScalar       x;
19673f1db9ecSBarry Smith   MatScalar         *aa, *v;
1968dfbe8321SBarry Smith   PetscErrorCode    ierr;
196953ef36baSBarry Smith   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
197053ef36baSBarry Smith   const PetscInt    *ai,*aj;
19712d61bbb3SSatish Balay 
19722d61bbb3SSatish Balay   PetscFunctionBegin;
19732d61bbb3SSatish Balay   ai  = a->i;
19742d61bbb3SSatish Balay   aj  = a->j;
19752d61bbb3SSatish Balay   aa  = a->a;
1976d0f46423SBarry Smith   m   = A->rmap->n;
1977d0f46423SBarry Smith   n   = A->cmap->n;
1978d0f46423SBarry Smith   bs  = A->rmap->bs;
19792d61bbb3SSatish Balay   mbs = a->mbs;
19802d61bbb3SSatish Balay   bs2 = a->bs2;
19812d61bbb3SSatish Balay   if (ll) {
198253ef36baSBarry Smith     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1983e1311b90SBarry Smith     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1984e32f2f54SBarry Smith     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
19852d61bbb3SSatish Balay     for (i=0; i<mbs; i++) { /* for each block row */
19862d61bbb3SSatish Balay       M  = ai[i+1] - ai[i];
19872d61bbb3SSatish Balay       li = l + i*bs;
19882d61bbb3SSatish Balay       v  = aa + bs2*ai[i];
19892d61bbb3SSatish Balay       for (j=0; j<M; j++) { /* for each block */
19902d61bbb3SSatish Balay         for (k=0; k<bs2; k++) {
19912d61bbb3SSatish Balay           (*v++) *= li[k%bs];
19922d61bbb3SSatish Balay         }
19932d61bbb3SSatish Balay       }
19942d61bbb3SSatish Balay     }
199553ef36baSBarry Smith     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1996efee365bSSatish Balay     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
19972d61bbb3SSatish Balay   }
19982d61bbb3SSatish Balay 
19992d61bbb3SSatish Balay   if (rr) {
200053ef36baSBarry Smith     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2001e1311b90SBarry Smith     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2002e32f2f54SBarry Smith     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
20032d61bbb3SSatish Balay     for (i=0; i<mbs; i++) { /* for each block row */
200453ef36baSBarry Smith       iai = ai[i];
200553ef36baSBarry Smith       M   = ai[i+1] - iai;
200653ef36baSBarry Smith       v   = aa + bs2*iai;
20072d61bbb3SSatish Balay       for (j=0; j<M; j++) { /* for each block */
200853ef36baSBarry Smith         ri = r + bs*aj[iai+j];
20092d61bbb3SSatish Balay         for (k=0; k<bs; k++) {
20102d61bbb3SSatish Balay           x = ri[k];
201153ef36baSBarry Smith           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
201253ef36baSBarry Smith           v += bs;
20132d61bbb3SSatish Balay         }
20142d61bbb3SSatish Balay       }
20152d61bbb3SSatish Balay     }
201653ef36baSBarry Smith     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2017efee365bSSatish Balay     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
20182d61bbb3SSatish Balay   }
20192d61bbb3SSatish Balay   PetscFunctionReturn(0);
20202d61bbb3SSatish Balay }
20212d61bbb3SSatish Balay 
20222d61bbb3SSatish Balay 
2023dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
20242d61bbb3SSatish Balay {
20252d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
20262d61bbb3SSatish Balay 
20272d61bbb3SSatish Balay   PetscFunctionBegin;
20282d61bbb3SSatish Balay   info->block_size   = a->bs2;
2029ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;
20302d61bbb3SSatish Balay   info->nz_used      = a->bs2*a->nz;
20312d61bbb3SSatish Balay   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
20322d61bbb3SSatish Balay   info->assemblies   = A->num_ass;
20338e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
20347adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
2035d5f3da31SBarry Smith   if (A->factortype) {
20362d61bbb3SSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
20372d61bbb3SSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
20382d61bbb3SSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
20392d61bbb3SSatish Balay   } else {
20402d61bbb3SSatish Balay     info->fill_ratio_given  = 0;
20412d61bbb3SSatish Balay     info->fill_ratio_needed = 0;
20422d61bbb3SSatish Balay     info->factor_mallocs    = 0;
20432d61bbb3SSatish Balay   }
20442d61bbb3SSatish Balay   PetscFunctionReturn(0);
20452d61bbb3SSatish Balay }
20462d61bbb3SSatish Balay 
2047dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
20482d61bbb3SSatish Balay {
20492d61bbb3SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2050dfbe8321SBarry Smith   PetscErrorCode ierr;
20512d61bbb3SSatish Balay 
20522d61bbb3SSatish Balay   PetscFunctionBegin;
2053549d3d68SSatish Balay   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
20542d61bbb3SSatish Balay   PetscFunctionReturn(0);
20552d61bbb3SSatish Balay }
2056