xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision dcca6d9d80ebd869fe6029bd05a3aa9faafef49e)
1cac129eeSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
306873bf2SBarry Smith #include <petsc-private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <petscblaslapack.h>
6d6232bceSMichael Lange #if defined(PETSC_THREADCOMM_ACTIVE)
7d6232bceSMichael Lange #include <petscthreadcomm.h>
8d6232bceSMichael Lange #endif
9cac129eeSSatish Balay 
104a2ae208SSatish Balay #undef __FUNCT__
114a2ae208SSatish Balay #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
12690b6cddSBarry Smith PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13a3192f15SSatish Balay {
14a3192f15SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
156849ba73SBarry Smith   PetscErrorCode ierr;
165d0c19d7SBarry Smith   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
175d0c19d7SBarry Smith   const PetscInt *idx;
18690b6cddSBarry Smith   PetscInt       start,end,*ai,*aj,bs,*nidx2;
19f1af5d2fSBarry Smith   PetscBT        table;
20a3192f15SSatish Balay 
213a40ed3dSBarry Smith   PetscFunctionBegin;
22a3192f15SSatish Balay   m  = a->mbs;
23a3192f15SSatish Balay   ai = a->i;
24a3192f15SSatish Balay   aj = a->j;
25d0f46423SBarry Smith   bs = A->rmap->bs;
26a3192f15SSatish Balay 
27e32f2f54SBarry Smith   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28a3192f15SSatish Balay 
2953b8de81SBarry Smith   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
30690b6cddSBarry Smith   ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
31d0f46423SBarry Smith   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
32a3192f15SSatish Balay 
33a3192f15SSatish Balay   for (i=0; i<is_max; i++) {
34a3192f15SSatish Balay     /* Initialise the two local arrays */
35a3192f15SSatish Balay     isz  = 0;
366831982aSBarry Smith     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
37a3192f15SSatish Balay 
38a3192f15SSatish Balay     /* Extract the indices, assume there can be duplicate entries */
39a3192f15SSatish Balay     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40b9b97703SBarry Smith     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
41a3192f15SSatish Balay 
42a3192f15SSatish Balay     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43a3192f15SSatish Balay     for (j=0; j<n; ++j) {
44218c64b6SSatish Balay       ival = idx[j]/bs; /* convert the indices into block indices */
45e32f2f54SBarry Smith       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
4626fbe8dcSKarl Rupp       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47a3192f15SSatish Balay     }
48a3192f15SSatish Balay     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
496bf464f9SBarry Smith     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
50a3192f15SSatish Balay 
51a3192f15SSatish Balay     k = 0;
52a3192f15SSatish Balay     for (j=0; j<ov; j++) { /* for each overlap*/
53a3192f15SSatish Balay       n = isz;
54a3192f15SSatish Balay       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
55a3192f15SSatish Balay         row   = nidx[k];
56a3192f15SSatish Balay         start = ai[row];
57a3192f15SSatish Balay         end   = ai[row+1];
58a3192f15SSatish Balay         for (l = start; l<end; l++) {
59a3192f15SSatish Balay           val = aj[l];
6026fbe8dcSKarl Rupp           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61a3192f15SSatish Balay         }
62a3192f15SSatish Balay       }
63a3192f15SSatish Balay     }
64218c64b6SSatish Balay     /* expand the Index Set */
65218c64b6SSatish Balay     for (j=0; j<isz; j++) {
6626fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67218c64b6SSatish Balay     }
6870b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
69a3192f15SSatish Balay   }
7094bacf5dSBarry Smith   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
71606d414cSSatish Balay   ierr = PetscFree(nidx);CHKERRQ(ierr);
72606d414cSSatish Balay   ierr = PetscFree(nidx2);CHKERRQ(ierr);
733a40ed3dSBarry Smith   PetscFunctionReturn(0);
74a3192f15SSatish Balay }
751c351548SSatish Balay 
764a2ae208SSatish Balay #undef __FUNCT__
774a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
784aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
79736121d4SSatish Balay {
80736121d4SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
816849ba73SBarry Smith   PetscErrorCode ierr;
82690b6cddSBarry Smith   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
83690b6cddSBarry Smith   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
845d0c19d7SBarry Smith   const PetscInt *irow,*icol;
855d0c19d7SBarry Smith   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
86690b6cddSBarry Smith   PetscInt       *aj = a->j,*ai = a->i;
873f1db9ecSBarry Smith   MatScalar      *mat_a;
88736121d4SSatish Balay   Mat            C;
89ace3abfcSBarry Smith   PetscBool      flag,sorted;
90736121d4SSatish Balay 
913a40ed3dSBarry Smith   PetscFunctionBegin;
9214ca34e6SBarry Smith   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
93e32f2f54SBarry Smith   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
94736121d4SSatish Balay 
95736121d4SSatish Balay   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
96218c64b6SSatish Balay   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
97b9b97703SBarry Smith   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
98b9b97703SBarry Smith   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
99736121d4SSatish Balay 
100690b6cddSBarry Smith   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
101736121d4SSatish Balay   ssmap = smap;
102690b6cddSBarry Smith   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
103690b6cddSBarry Smith   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
104736121d4SSatish Balay   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
105736121d4SSatish Balay   /* determine lens of each row */
106736121d4SSatish Balay   for (i=0; i<nrows; i++) {
107736121d4SSatish Balay     kstart  = ai[irow[i]];
108736121d4SSatish Balay     kend    = kstart + a->ilen[irow[i]];
109736121d4SSatish Balay     lens[i] = 0;
110736121d4SSatish Balay     for (k=kstart; k<kend; k++) {
11126fbe8dcSKarl Rupp       if (ssmap[aj[k]]) lens[i]++;
112736121d4SSatish Balay     }
113736121d4SSatish Balay   }
114736121d4SSatish Balay   /* Create and fill new matrix */
115736121d4SSatish Balay   if (scall == MAT_REUSE_MATRIX) {
116736121d4SSatish Balay     c = (Mat_SeqBAIJ*)((*B)->data);
117736121d4SSatish Balay 
118e32f2f54SBarry Smith     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
119690b6cddSBarry Smith     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
120e7e72b3dSBarry Smith     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
121690b6cddSBarry Smith     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
122736121d4SSatish Balay     C    = *B;
1233a40ed3dSBarry Smith   } else {
124ce94432eSBarry Smith     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
125f69a0ea3SMatthew Knepley     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1267adad957SLisandro Dalcin     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
127ab93d7beSBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
128736121d4SSatish Balay   }
129736121d4SSatish Balay   c = (Mat_SeqBAIJ*)(C->data);
130736121d4SSatish Balay   for (i=0; i<nrows; i++) {
131736121d4SSatish Balay     row      = irow[i];
132736121d4SSatish Balay     kstart   = ai[row];
133736121d4SSatish Balay     kend     = kstart + a->ilen[row];
134736121d4SSatish Balay     mat_i    = c->i[i];
135736121d4SSatish Balay     mat_j    = c->j + mat_i;
136218c64b6SSatish Balay     mat_a    = c->a + mat_i*bs2;
137736121d4SSatish Balay     mat_ilen = c->ilen + i;
138736121d4SSatish Balay     for (k=kstart; k<kend; k++) {
139736121d4SSatish Balay       if ((tcol=ssmap[a->j[k]])) {
140736121d4SSatish Balay         *mat_j++ = tcol - 1;
141549d3d68SSatish Balay         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
142549d3d68SSatish Balay         mat_a   += bs2;
143736121d4SSatish Balay         (*mat_ilen)++;
144736121d4SSatish Balay       }
145736121d4SSatish Balay     }
146736121d4SSatish Balay   }
147218c64b6SSatish Balay 
148736121d4SSatish Balay   /* Free work space */
149736121d4SSatish Balay   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
150606d414cSSatish Balay   ierr = PetscFree(smap);CHKERRQ(ierr);
151606d414cSSatish Balay   ierr = PetscFree(lens);CHKERRQ(ierr);
1526d4a8577SBarry Smith   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1536d4a8577SBarry Smith   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
154736121d4SSatish Balay 
155736121d4SSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
156736121d4SSatish Balay   *B   = C;
1573a40ed3dSBarry Smith   PetscFunctionReturn(0);
158736121d4SSatish Balay }
159736121d4SSatish Balay 
1604a2ae208SSatish Balay #undef __FUNCT__
1614a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
1624aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
163218c64b6SSatish Balay {
164218c64b6SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
165218c64b6SSatish Balay   IS             is1,is2;
1666849ba73SBarry Smith   PetscErrorCode ierr;
1675d0c19d7SBarry Smith   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
1685d0c19d7SBarry Smith   const PetscInt *irow,*icol;
169218c64b6SSatish Balay 
1703a40ed3dSBarry Smith   PetscFunctionBegin;
171218c64b6SSatish Balay   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
172218c64b6SSatish Balay   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
173b9b97703SBarry Smith   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
174b9b97703SBarry Smith   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
175218c64b6SSatish Balay 
176218c64b6SSatish Balay   /* Verify if the indices corespond to each element in a block
177218c64b6SSatish Balay    and form the IS with compressed IS */
178*dcca6d9dSJed Brown   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
179fca92195SBarry Smith   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
180218c64b6SSatish Balay   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
181218c64b6SSatish Balay   count = 0;
182218c64b6SSatish Balay   for (i=0; i<a->mbs; i++) {
183e32f2f54SBarry Smith     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
184218c64b6SSatish Balay     if (vary[i]==bs) iary[count++] = i;
185218c64b6SSatish Balay   }
18670b3c8c7SBarry Smith   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
187218c64b6SSatish Balay 
188690b6cddSBarry Smith   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
189218c64b6SSatish Balay   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
190218c64b6SSatish Balay   count = 0;
191218c64b6SSatish Balay   for (i=0; i<a->mbs; i++) {
192e32f2f54SBarry Smith     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
193218c64b6SSatish Balay     if (vary[i]==bs) iary[count++] = i;
194218c64b6SSatish Balay   }
19570b3c8c7SBarry Smith   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
196218c64b6SSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
197218c64b6SSatish Balay   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
198fca92195SBarry Smith   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
199218c64b6SSatish Balay 
2004aa3045dSJed Brown   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
2016bf464f9SBarry Smith   ierr = ISDestroy(&is1);CHKERRQ(ierr);
2026bf464f9SBarry Smith   ierr = ISDestroy(&is2);CHKERRQ(ierr);
2033a40ed3dSBarry Smith   PetscFunctionReturn(0);
204218c64b6SSatish Balay }
205218c64b6SSatish Balay 
2064a2ae208SSatish Balay #undef __FUNCT__
2074a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
208690b6cddSBarry Smith PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
209736121d4SSatish Balay {
2106849ba73SBarry Smith   PetscErrorCode ierr;
211690b6cddSBarry Smith   PetscInt       i;
212736121d4SSatish Balay 
2133a40ed3dSBarry Smith   PetscFunctionBegin;
214736121d4SSatish Balay   if (scall == MAT_INITIAL_MATRIX) {
21582502324SSatish Balay     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
216736121d4SSatish Balay   }
217736121d4SSatish Balay 
218736121d4SSatish Balay   for (i=0; i<n; i++) {
2194aa3045dSJed Brown     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
220736121d4SSatish Balay   }
2213a40ed3dSBarry Smith   PetscFunctionReturn(0);
222736121d4SSatish Balay }
223218c64b6SSatish Balay 
224218c64b6SSatish Balay 
2252d61bbb3SSatish Balay /* -------------------------------------------------------*/
2262d61bbb3SSatish Balay /* Should check that shapes of vectors and matrices match */
2272d61bbb3SSatish Balay /* -------------------------------------------------------*/
2282d61bbb3SSatish Balay 
229d6232bceSMichael Lange #if defined(PETSC_THREADCOMM_ACTIVE)
230d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
231d6232bceSMichael Lange {
232d6232bceSMichael Lange   PetscErrorCode    ierr;
233d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
234d6232bceSMichael Lange   PetscScalar       *z;
235d6232bceSMichael Lange   const PetscScalar *x;
236d6232bceSMichael Lange   const MatScalar   *aa;
237d6232bceSMichael Lange   PetscInt          *trstarts=A->rmap->trstarts;
238d6232bceSMichael Lange   PetscInt          n,start,end,i;
239d6232bceSMichael Lange   const PetscInt    *aj,*ai;
240d6232bceSMichael Lange   PetscScalar       sum;
241d6232bceSMichael Lange 
242d6232bceSMichael Lange   ierr  = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
243d6232bceSMichael Lange   ierr  = VecGetArray(zz,&z);CHKERRQ(ierr);
244d6232bceSMichael Lange   start = trstarts[thread_id];
245d6232bceSMichael Lange   end   = trstarts[thread_id+1];
246d6232bceSMichael Lange   ai    = a->i;
247d6232bceSMichael Lange   for (i=start; i<end; i++) {
248d6232bceSMichael Lange     n   = ai[i+1] - ai[i];
249d6232bceSMichael Lange     aj  = a->j + ai[i];
250d6232bceSMichael Lange     aa  = a->a + ai[i];
251d6232bceSMichael Lange     sum = 0.0;
252d6232bceSMichael Lange     PetscSparseDensePlusDot(sum,x,aa,aj,n);
253d6232bceSMichael Lange     z[i] = sum;
254d6232bceSMichael Lange   }
255d6232bceSMichael Lange   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
256d6232bceSMichael Lange   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
257d6232bceSMichael Lange   return 0;
258d6232bceSMichael Lange }
259d6232bceSMichael Lange 
260d6232bceSMichael Lange #undef __FUNCT__
261d6232bceSMichael Lange #define __FUNCT__ "MatMult_SeqBAIJ_1"
262d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
263d6232bceSMichael Lange {
264d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
265d6232bceSMichael Lange   PetscScalar       *z,sum;
266d6232bceSMichael Lange   const PetscScalar *x;
267d6232bceSMichael Lange   const MatScalar   *v;
268d6232bceSMichael Lange   PetscErrorCode    ierr;
269d6232bceSMichael Lange   PetscInt          mbs,i,n;
270d6232bceSMichael Lange   const PetscInt    *idx,*ii,*ridx=NULL;
271d6232bceSMichael Lange   PetscBool         usecprow=a->compressedrow.use;
272d6232bceSMichael Lange 
273d6232bceSMichael Lange   PetscFunctionBegin;
274d6232bceSMichael Lange 
275d6232bceSMichael Lange   if (usecprow) {
276d6232bceSMichael Lange     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
277d6232bceSMichael Lange     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
278d6232bceSMichael Lange     mbs  = a->compressedrow.nrows;
279d6232bceSMichael Lange     ii   = a->compressedrow.i;
280d6232bceSMichael Lange     ridx = a->compressedrow.rindex;
281d6232bceSMichael Lange     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
282d6232bceSMichael Lange     for (i=0; i<mbs; i++) {
283d6232bceSMichael Lange       n   = ii[i+1] - ii[i];
284d6232bceSMichael Lange       v   = a->a + ii[i];
285d6232bceSMichael Lange       idx = a->j + ii[i];
286d6232bceSMichael Lange       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
287d6232bceSMichael Lange       PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
288d6232bceSMichael Lange       sum = 0.0;
289d6232bceSMichael Lange       PetscSparseDensePlusDot(sum,x,v,idx,n);
290d6232bceSMichael Lange       z[ridx[i]] = sum;
291d6232bceSMichael Lange     }
292d6232bceSMichael Lange     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
293d6232bceSMichael Lange     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
294d6232bceSMichael Lange   } else {
295d6232bceSMichael Lange     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);CHKERRQ(ierr);
296d6232bceSMichael Lange   }
297d6232bceSMichael Lange   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
298d6232bceSMichael Lange   PetscFunctionReturn(0);
299d6232bceSMichael Lange }
300d6232bceSMichael Lange #else
3014a2ae208SSatish Balay #undef __FUNCT__
3024a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_1"
303dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
3042d61bbb3SSatish Balay {
3052d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
306d9fead3dSBarry Smith   PetscScalar       *z,sum;
307d9fead3dSBarry Smith   const PetscScalar *x;
308d9fead3dSBarry Smith   const MatScalar   *v;
3096849ba73SBarry Smith   PetscErrorCode    ierr;
3107c565772SBarry Smith   PetscInt          mbs,i,n;
3110298fd71SBarry Smith   const PetscInt    *idx,*ii,*ridx=NULL;
312ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
3132d61bbb3SSatish Balay 
3142d61bbb3SSatish Balay   PetscFunctionBegin;
3153649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3161ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
3172d61bbb3SSatish Balay 
31826e093fcSHong Zhang   if (usecprow) {
31926e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
32026e093fcSHong Zhang     ii   = a->compressedrow.i;
3217b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
322a1c3900fSBarry Smith     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
32326e093fcSHong Zhang   } else {
32426e093fcSHong Zhang     mbs = a->mbs;
3252d61bbb3SSatish Balay     ii  = a->i;
32626e093fcSHong Zhang   }
3272d61bbb3SSatish Balay 
3282d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
329ee54c7eeSHong Zhang     n   = ii[1] - ii[0];
330ee54c7eeSHong Zhang     v   = a->a + ii[0];
331ee54c7eeSHong Zhang     idx = a->j + ii[0];
332ee54c7eeSHong Zhang     ii++;
333444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
334444d8c10SJed Brown     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3352d61bbb3SSatish Balay     sum = 0.0;
3362162cab8SBarry Smith     PetscSparseDensePlusDot(sum,x,v,idx,n);
33726e093fcSHong Zhang     if (usecprow) {
3387b2bb3b9SHong Zhang       z[ridx[i]] = sum;
33926e093fcSHong Zhang     } else {
3402d61bbb3SSatish Balay       z[i]        = sum;
3412d61bbb3SSatish Balay     }
34226e093fcSHong Zhang   }
3433649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3441ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
3457c565772SBarry Smith   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
3462d61bbb3SSatish Balay   PetscFunctionReturn(0);
3472d61bbb3SSatish Balay }
348d6232bceSMichael Lange #endif
3492d61bbb3SSatish Balay 
350d6232bceSMichael Lange #if defined(PETSC_THREADCOMM_ACTIVE)
351d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
352d6232bceSMichael Lange {
353d6232bceSMichael Lange   PetscErrorCode    ierr;
354d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
355d6232bceSMichael Lange   PetscScalar       *z,x1,x2,sum1,sum2;
356d6232bceSMichael Lange   const PetscScalar *x,*xb;
357d6232bceSMichael Lange   const MatScalar   *aa;
358d6232bceSMichael Lange   PetscInt          *trstarts=A->rmap->trstarts;
359d6232bceSMichael Lange   PetscInt          n,start,end,i,j;
360d6232bceSMichael Lange   const PetscInt    *aj,*ai;
361d6232bceSMichael Lange 
362d6232bceSMichael Lange   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
363d6232bceSMichael Lange   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
364d6232bceSMichael Lange   start  = trstarts[thread_id] / 2;
365d6232bceSMichael Lange   end    = trstarts[thread_id+1] / 2;
366d6232bceSMichael Lange   ai     = a->i;
367d6232bceSMichael Lange   for (i=start; i<end; i++) {
368d6232bceSMichael Lange     n    = ai[i+1] - ai[i];
369d6232bceSMichael Lange     aj   = a->j + ai[i];
370d6232bceSMichael Lange     aa   = a->a + ai[i]*4;
371d6232bceSMichael Lange     sum1 = 0.0; sum2 = 0.0;
372d6232bceSMichael Lange     for (j=0; j<n; j++) {
373d6232bceSMichael Lange       xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1];
374d6232bceSMichael Lange       sum1 += aa[4*j]*x1   + aa[4*j+2]*x2;
375d6232bceSMichael Lange       sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2;
376d6232bceSMichael Lange     }
377d6232bceSMichael Lange     z[2*i] = sum1; z[2*i+1] = sum2;
378d6232bceSMichael Lange   }
379d6232bceSMichael Lange   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
380d6232bceSMichael Lange   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
381d6232bceSMichael Lange   return 0;
382d6232bceSMichael Lange }
383d6232bceSMichael Lange 
384d6232bceSMichael Lange #undef __FUNCT__
385d6232bceSMichael Lange #define __FUNCT__ "MatMult_SeqBAIJ_2"
386d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
387d6232bceSMichael Lange {
388d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
389d6232bceSMichael Lange   PetscScalar       *z,x1,x2,sum1,sum2;
390d6232bceSMichael Lange   const PetscScalar *x,*xb;
391d6232bceSMichael Lange   const MatScalar   *v;
392d6232bceSMichael Lange   PetscErrorCode    ierr;
393d6232bceSMichael Lange   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
394d6232bceSMichael Lange   PetscBool         usecprow=a->compressedrow.use;
395d6232bceSMichael Lange 
396d6232bceSMichael Lange   PetscFunctionBegin;
397d6232bceSMichael Lange 
398d6232bceSMichael Lange   if (usecprow) {
399d6232bceSMichael Lange     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
400d6232bceSMichael Lange     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
401d6232bceSMichael Lange     mbs  = a->compressedrow.nrows;
402d6232bceSMichael Lange     ii   = a->compressedrow.i;
403d6232bceSMichael Lange     ridx = a->compressedrow.rindex;
404d6232bceSMichael Lange     for (i=0; i<mbs; i++) {
405d6232bceSMichael Lange       n    = ii[i+1] - ii[i];
406d6232bceSMichael Lange       idx  = a->j + ii[i];
407d6232bceSMichael Lange       v    = a->a + ii[i]*4;
408d6232bceSMichael Lange       sum1 = 0.0; sum2 = 0.0;
409d6232bceSMichael Lange       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
410d6232bceSMichael Lange       PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
411d6232bceSMichael Lange       for (j=0; j<n; j++) {
412d6232bceSMichael Lange 	xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1];
413d6232bceSMichael Lange 	sum1 += v[4*j]*x1   + v[4*j+2]*x2;
414d6232bceSMichael Lange 	sum2 += v[4*j+1]*x1 + v[4*j+3]*x2;
415d6232bceSMichael Lange       }
416d6232bceSMichael Lange       z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2;
417d6232bceSMichael Lange     }
418d6232bceSMichael Lange     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
419d6232bceSMichael Lange     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
420d6232bceSMichael Lange   } else {
421d6232bceSMichael Lange     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);CHKERRQ(ierr);
422d6232bceSMichael Lange   }
423d6232bceSMichael Lange   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
424d6232bceSMichael Lange   PetscFunctionReturn(0);
425d6232bceSMichael Lange }
426d6232bceSMichael Lange #else
4274a2ae208SSatish Balay #undef __FUNCT__
4284a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_2"
429dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
4302d61bbb3SSatish Balay {
4312d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
432d9fead3dSBarry Smith   PetscScalar       *z = 0,sum1,sum2,*zarray;
433d9fead3dSBarry Smith   const PetscScalar *x,*xb;
43487828ca2SBarry Smith   PetscScalar       x1,x2;
435d9fead3dSBarry Smith   const MatScalar   *v;
436dfbe8321SBarry Smith   PetscErrorCode    ierr;
4377c565772SBarry Smith   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
438ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
4392d61bbb3SSatish Balay 
4402d61bbb3SSatish Balay   PetscFunctionBegin;
4413649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
44226e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
4432d61bbb3SSatish Balay 
4442d61bbb3SSatish Balay   idx = a->j;
4452d61bbb3SSatish Balay   v   = a->a;
44626e093fcSHong Zhang   if (usecprow) {
44726e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
44826e093fcSHong Zhang     ii   = a->compressedrow.i;
4497b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
45026e093fcSHong Zhang   } else {
45126e093fcSHong Zhang     mbs = a->mbs;
4522d61bbb3SSatish Balay     ii  = a->i;
45326e093fcSHong Zhang     z   = zarray;
45426e093fcSHong Zhang   }
4552d61bbb3SSatish Balay 
4562d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
4572d61bbb3SSatish Balay     n           = ii[1] - ii[0]; ii++;
4582d61bbb3SSatish Balay     sum1        = 0.0; sum2 = 0.0;
459444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
460444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
4612d61bbb3SSatish Balay     for (j=0; j<n; j++) {
4622d61bbb3SSatish Balay       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
4632d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[2]*x2;
4642d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[3]*x2;
4652d61bbb3SSatish Balay       v    += 4;
4662d61bbb3SSatish Balay     }
4677b2bb3b9SHong Zhang     if (usecprow) z = zarray + 2*ridx[i];
4682d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2;
46926e093fcSHong Zhang     if (!usecprow) z += 2;
4702d61bbb3SSatish Balay   }
4713649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
47226e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
4737c565772SBarry Smith   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
4742d61bbb3SSatish Balay   PetscFunctionReturn(0);
4752d61bbb3SSatish Balay }
476d6232bceSMichael Lange #endif
4772d61bbb3SSatish Balay 
478d6232bceSMichael Lange #if defined(PETSC_THREADCOMM_ACTIVE)
479d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
480d6232bceSMichael Lange {
481d6232bceSMichael Lange   PetscErrorCode    ierr;
482d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
483d6232bceSMichael Lange   PetscScalar       *z,x1,x2,x3,sum1,sum2,sum3;
484d6232bceSMichael Lange   const PetscScalar *x,*xb;
485d6232bceSMichael Lange   const MatScalar   *aa;
486d6232bceSMichael Lange   PetscInt          *trstarts=A->rmap->trstarts;
487d6232bceSMichael Lange   PetscInt          n,start,end,i,j;
488d6232bceSMichael Lange   const PetscInt    *aj,*ai;
489d6232bceSMichael Lange 
490d6232bceSMichael Lange   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
491d6232bceSMichael Lange   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
492d6232bceSMichael Lange   start  = trstarts[thread_id] / 3;
493d6232bceSMichael Lange   end    = trstarts[thread_id+1] / 3;
494d6232bceSMichael Lange   ai     = a->i;
495d6232bceSMichael Lange   for (i=start; i<end; i++) {
496d6232bceSMichael Lange     n    = ai[i+1] - ai[i];
497d6232bceSMichael Lange     aj   = a->j + ai[i];
498d6232bceSMichael Lange     aa   = a->a + ai[i]*9;
499d6232bceSMichael Lange     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
500d6232bceSMichael Lange     for (j=0; j<n; j++) {
501d6232bceSMichael Lange       xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
502d6232bceSMichael Lange       sum1 += aa[9*j]*x1   + aa[9*j+3]*x2 + aa[9*j+6]*x3;
503d6232bceSMichael Lange       sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3;
504d6232bceSMichael Lange       sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3;
505d6232bceSMichael Lange     }
506d6232bceSMichael Lange     z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3;
507d6232bceSMichael Lange   }
508d6232bceSMichael Lange   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
509d6232bceSMichael Lange   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
510d6232bceSMichael Lange   return 0;
511d6232bceSMichael Lange }
512d6232bceSMichael Lange 
513d6232bceSMichael Lange #undef __FUNCT__
514d6232bceSMichael Lange #define __FUNCT__ "MatMult_SeqBAIJ_3"
515d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
516d6232bceSMichael Lange {
517d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
518d6232bceSMichael Lange   PetscScalar       *z,sum1,sum2,sum3,x1,x2,x3;
519d6232bceSMichael Lange   const PetscScalar *x,*xb;
520d6232bceSMichael Lange   const MatScalar   *v;
521d6232bceSMichael Lange   PetscErrorCode    ierr;
522d6232bceSMichael Lange   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
523d6232bceSMichael Lange   PetscBool         usecprow=a->compressedrow.use;
524d6232bceSMichael Lange 
525d6232bceSMichael Lange 
526d6232bceSMichael Lange #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
527d6232bceSMichael Lange #pragma disjoint(*v,*z,*xb)
528d6232bceSMichael Lange #endif
529d6232bceSMichael Lange 
530d6232bceSMichael Lange   PetscFunctionBegin;
531d6232bceSMichael Lange 
532d6232bceSMichael Lange   if (usecprow) {
533d6232bceSMichael Lange     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
534d6232bceSMichael Lange     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
535d6232bceSMichael Lange     mbs  = a->compressedrow.nrows;
536d6232bceSMichael Lange     ii   = a->compressedrow.i;
537d6232bceSMichael Lange     ridx = a->compressedrow.rindex;
538d6232bceSMichael Lange     for (i=0; i<mbs; i++) {
539d6232bceSMichael Lange       n    = ii[i+1] - ii[i];
540d6232bceSMichael Lange       idx  = a->j + ii[i];
541d6232bceSMichael Lange       v    = a->a + ii[i]*9;
542d6232bceSMichael Lange       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
543d6232bceSMichael Lange       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
544d6232bceSMichael Lange       PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
545d6232bceSMichael Lange       for (j=0; j<n; j++) {
546d6232bceSMichael Lange 	xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
547d6232bceSMichael Lange 	sum1 += v[9*j]*x1   + v[9*j+3]*x2 + v[9*j+6]*x3;
548d6232bceSMichael Lange 	sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3;
549d6232bceSMichael Lange 	sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3;
550d6232bceSMichael Lange       }
551d6232bceSMichael Lange       z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3;
552d6232bceSMichael Lange     }
553d6232bceSMichael Lange     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
554d6232bceSMichael Lange     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
555d6232bceSMichael Lange   } else {
556d6232bceSMichael Lange     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);CHKERRQ(ierr);
557d6232bceSMichael Lange   }
558d6232bceSMichael Lange   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
559d6232bceSMichael Lange   PetscFunctionReturn(0);
560d6232bceSMichael Lange }
561d6232bceSMichael Lange #else
5624a2ae208SSatish Balay #undef __FUNCT__
5634a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_3"
564dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
5652d61bbb3SSatish Balay {
5662d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
567d9fead3dSBarry Smith   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
568d9fead3dSBarry Smith   const PetscScalar *x,*xb;
569d9fead3dSBarry Smith   const MatScalar   *v;
570dfbe8321SBarry Smith   PetscErrorCode    ierr;
5717c565772SBarry Smith   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
572ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
57326e093fcSHong Zhang 
5742d61bbb3SSatish Balay 
575b6410449SSatish Balay #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
576fee21e36SBarry Smith #pragma disjoint(*v,*z,*xb)
577fee21e36SBarry Smith #endif
578fee21e36SBarry Smith 
5792d61bbb3SSatish Balay   PetscFunctionBegin;
5803649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
58126e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
5822d61bbb3SSatish Balay 
5832d61bbb3SSatish Balay   idx = a->j;
5842d61bbb3SSatish Balay   v   = a->a;
58526e093fcSHong Zhang   if (usecprow) {
58626e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
58726e093fcSHong Zhang     ii   = a->compressedrow.i;
5887b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
58926e093fcSHong Zhang   } else {
59026e093fcSHong Zhang     mbs = a->mbs;
5912d61bbb3SSatish Balay     ii  = a->i;
59226e093fcSHong Zhang     z   = zarray;
59326e093fcSHong Zhang   }
5942d61bbb3SSatish Balay 
5952d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
5962d61bbb3SSatish Balay     n           = ii[1] - ii[0]; ii++;
5972d61bbb3SSatish Balay     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
598444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
599444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
6002d61bbb3SSatish Balay     for (j=0; j<n; j++) {
60126fbe8dcSKarl Rupp       xb = x + 3*(*idx++);
60226fbe8dcSKarl Rupp       x1 = xb[0];
60326fbe8dcSKarl Rupp       x2 = xb[1];
60426fbe8dcSKarl Rupp       x3 = xb[2];
60526fbe8dcSKarl Rupp 
6062d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
6072d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
6082d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
6092d61bbb3SSatish Balay       v    += 9;
6102d61bbb3SSatish Balay     }
6117b2bb3b9SHong Zhang     if (usecprow) z = zarray + 3*ridx[i];
6122d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3;
61326e093fcSHong Zhang     if (!usecprow) z += 3;
6142d61bbb3SSatish Balay   }
6153649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
61626e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
6177c565772SBarry Smith   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
6182d61bbb3SSatish Balay   PetscFunctionReturn(0);
6192d61bbb3SSatish Balay }
620d6232bceSMichael Lange #endif
6212d61bbb3SSatish Balay 
622d6232bceSMichael Lange #if defined(PETSC_THREADCOMM_ACTIVE)
623d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
624d6232bceSMichael Lange {
625d6232bceSMichael Lange   PetscErrorCode    ierr;
626d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
627d6232bceSMichael Lange   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
628d6232bceSMichael Lange   const PetscScalar *x,*xb;
629d6232bceSMichael Lange   const MatScalar   *aa;
630d6232bceSMichael Lange   PetscInt          *trstarts=A->rmap->trstarts;
631d6232bceSMichael Lange   PetscInt          n,start,end,i,j;
632d6232bceSMichael Lange   const PetscInt    *aj,*ai;
633d6232bceSMichael Lange 
634d6232bceSMichael Lange   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
635d6232bceSMichael Lange   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
636d6232bceSMichael Lange   start  = trstarts[thread_id] / 4;
637d6232bceSMichael Lange   end    = trstarts[thread_id+1] / 4;
638d6232bceSMichael Lange   ai     = a->i;
639d6232bceSMichael Lange   for (i=start; i<end; i++) {
640d6232bceSMichael Lange     n    = ai[i+1] - ai[i];
641d6232bceSMichael Lange     aj   = a->j + ai[i];
642d6232bceSMichael Lange     aa   = a->a + ai[i]*16;
643d6232bceSMichael Lange     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
644d6232bceSMichael Lange     for (j=0; j<n; j++) {
645d6232bceSMichael Lange       xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
646d6232bceSMichael Lange       sum1 += aa[16*j]*x1   + aa[16*j+4]*x2 + aa[16*j+8]*x3  + aa[16*j+12]*x4;
647d6232bceSMichael Lange       sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3  + aa[16*j+13]*x4;
648d6232bceSMichael Lange       sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4;
649d6232bceSMichael Lange       sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4;
650d6232bceSMichael Lange     }
651d6232bceSMichael Lange     z[4*i]   = sum1; z[4*i+1] = sum2;
652d6232bceSMichael Lange     z[4*i+2] = sum3; z[4*i+3] = sum4;
653d6232bceSMichael Lange   }
654d6232bceSMichael Lange   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
655d6232bceSMichael Lange   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
656d6232bceSMichael Lange   return 0;
657d6232bceSMichael Lange }
658d6232bceSMichael Lange 
659d6232bceSMichael Lange #undef __FUNCT__
660d6232bceSMichael Lange #define __FUNCT__ "MatMult_SeqBAIJ_4"
661d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
662d6232bceSMichael Lange {
663d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
664d6232bceSMichael Lange   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
665d6232bceSMichael Lange   const PetscScalar *x,*xb;
666d6232bceSMichael Lange   const MatScalar   *v;
667d6232bceSMichael Lange   PetscErrorCode    ierr;
668d6232bceSMichael Lange   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
669d6232bceSMichael Lange   PetscBool         usecprow=a->compressedrow.use;
670d6232bceSMichael Lange 
671d6232bceSMichael Lange   PetscFunctionBegin;
672d6232bceSMichael Lange 
673d6232bceSMichael Lange   if (usecprow) {
674d6232bceSMichael Lange     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
675d6232bceSMichael Lange     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
676d6232bceSMichael Lange     mbs  = a->compressedrow.nrows;
677d6232bceSMichael Lange     ii   = a->compressedrow.i;
678d6232bceSMichael Lange     ridx = a->compressedrow.rindex;
679d6232bceSMichael Lange     for (i=0; i<mbs; i++) {
680d6232bceSMichael Lange       n = ii[i+1] - ii[1];
681d6232bceSMichael Lange       idx  = a->j + ii[i];
682d6232bceSMichael Lange       v    = a->a + ii[i]*16;
683d6232bceSMichael Lange       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
684d6232bceSMichael Lange       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
685d6232bceSMichael Lange       PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
686d6232bceSMichael Lange       for (j=0; j<n; j++) {
687d6232bceSMichael Lange 	xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
688d6232bceSMichael Lange 	sum1 += v[16*j]*x1   + v[16*j+4]*x2 + v[16*j+8]*x3  + v[16*j+12]*x4;
689d6232bceSMichael Lange 	sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3  + v[16*j+13]*x4;
690d6232bceSMichael Lange 	sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4;
691d6232bceSMichael Lange 	sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4;
692d6232bceSMichael Lange       }
693d6232bceSMichael Lange       z[4*ridx[i]]   = sum1; z[4*ridx[i]+1] = sum2;
694d6232bceSMichael Lange       z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4;
695d6232bceSMichael Lange     }
696d6232bceSMichael Lange     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
697d6232bceSMichael Lange     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
698d6232bceSMichael Lange   } else {
699d6232bceSMichael Lange     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);CHKERRQ(ierr);
700d6232bceSMichael Lange   }
701d6232bceSMichael Lange   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
702d6232bceSMichael Lange   PetscFunctionReturn(0);
703d6232bceSMichael Lange }
704d6232bceSMichael Lange #else
7054a2ae208SSatish Balay #undef __FUNCT__
7064a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_4"
707dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
7082d61bbb3SSatish Balay {
7092d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
710d9fead3dSBarry Smith   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
711d9fead3dSBarry Smith   const PetscScalar *x,*xb;
712d9fead3dSBarry Smith   const MatScalar   *v;
713dfbe8321SBarry Smith   PetscErrorCode    ierr;
7147c565772SBarry Smith   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
715ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
7162d61bbb3SSatish Balay 
7172d61bbb3SSatish Balay   PetscFunctionBegin;
7183649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
71926e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
7202d61bbb3SSatish Balay 
7212d61bbb3SSatish Balay   idx = a->j;
7222d61bbb3SSatish Balay   v   = a->a;
72326e093fcSHong Zhang   if (usecprow) {
72426e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
72526e093fcSHong Zhang     ii   = a->compressedrow.i;
7267b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
72726e093fcSHong Zhang   } else {
72826e093fcSHong Zhang     mbs = a->mbs;
7292d61bbb3SSatish Balay     ii  = a->i;
73026e093fcSHong Zhang     z   = zarray;
73126e093fcSHong Zhang   }
7322d61bbb3SSatish Balay 
7332d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
73426fbe8dcSKarl Rupp     n = ii[1] - ii[0];
73526fbe8dcSKarl Rupp     ii++;
73626fbe8dcSKarl Rupp     sum1 = 0.0;
73726fbe8dcSKarl Rupp     sum2 = 0.0;
73826fbe8dcSKarl Rupp     sum3 = 0.0;
73926fbe8dcSKarl Rupp     sum4 = 0.0;
74026fbe8dcSKarl Rupp 
741444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
742444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
7432d61bbb3SSatish Balay     for (j=0; j<n; j++) {
7442d61bbb3SSatish Balay       xb    = x + 4*(*idx++);
7452d61bbb3SSatish Balay       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
7462d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
7472d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
7482d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
7492d61bbb3SSatish Balay       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
7502d61bbb3SSatish Balay       v    += 16;
7512d61bbb3SSatish Balay     }
7527b2bb3b9SHong Zhang     if (usecprow) z = zarray + 4*ridx[i];
7532d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
75426e093fcSHong Zhang     if (!usecprow) z += 4;
7552d61bbb3SSatish Balay   }
7563649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
75726e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
7587c565772SBarry Smith   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
7592d61bbb3SSatish Balay   PetscFunctionReturn(0);
7602d61bbb3SSatish Balay }
761d6232bceSMichael Lange #endif
7622d61bbb3SSatish Balay 
763d6232bceSMichael Lange #if defined(PETSC_THREADCOMM_ACTIVE)
764d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
765d6232bceSMichael Lange {
766d6232bceSMichael Lange   PetscErrorCode    ierr;
767d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
768d6232bceSMichael Lange   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
769d6232bceSMichael Lange   const PetscScalar *x,*xb;
770d6232bceSMichael Lange   const MatScalar   *aa;
771d6232bceSMichael Lange   PetscInt          *trstarts=A->rmap->trstarts;
772d6232bceSMichael Lange   PetscInt          n,start,end,i,j;
773d6232bceSMichael Lange   const PetscInt    *aj,*ai;
774d6232bceSMichael Lange 
775d6232bceSMichael Lange   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
776d6232bceSMichael Lange   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
777d6232bceSMichael Lange   start  = trstarts[thread_id] / 5;
778d6232bceSMichael Lange   end    = trstarts[thread_id+1] / 5;
779d6232bceSMichael Lange   ai     = a->i;
780d6232bceSMichael Lange   for (i=start; i<end; i++) {
781d6232bceSMichael Lange     n    = ai[i+1] - ai[i];
782d6232bceSMichael Lange     aj   = a->j + ai[i];
783d6232bceSMichael Lange     aa   = a->a + ai[i]*25;
784d6232bceSMichael Lange     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
785d6232bceSMichael Lange     for (j=0; j<n; j++) {
786d6232bceSMichael Lange       xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
787d6232bceSMichael Lange       sum1 += aa[25*j]*x1   + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5;
788d6232bceSMichael Lange       sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5;
789d6232bceSMichael Lange       sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5;
790d6232bceSMichael Lange       sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5;
791d6232bceSMichael Lange       sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5;
792d6232bceSMichael Lange     }
793d6232bceSMichael Lange     z[5*i]   = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3;
794d6232bceSMichael Lange     z[5*i+3] = sum4; z[5*i+4] = sum5;
795d6232bceSMichael Lange   }
796d6232bceSMichael Lange   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
797d6232bceSMichael Lange   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
798d6232bceSMichael Lange   return 0;
799d6232bceSMichael Lange }
800d6232bceSMichael Lange 
801d6232bceSMichael Lange #undef __FUNCT__
802d6232bceSMichael Lange #define __FUNCT__ "MatMult_SeqBAIJ_5"
803d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
804d6232bceSMichael Lange {
805d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
806d6232bceSMichael Lange   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
807d6232bceSMichael Lange   const PetscScalar *xb,*x;
808d6232bceSMichael Lange   const MatScalar   *v;
809d6232bceSMichael Lange   PetscErrorCode    ierr;
810d6232bceSMichael Lange   const PetscInt    *idx,*ii,*ridx=NULL;
811d6232bceSMichael Lange   PetscInt          mbs,i,j,n;
812d6232bceSMichael Lange   PetscBool         usecprow=a->compressedrow.use;
813d6232bceSMichael Lange 
814d6232bceSMichael Lange   PetscFunctionBegin;
815d6232bceSMichael Lange 
816d6232bceSMichael Lange   if (usecprow) {
817d6232bceSMichael Lange     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
818d6232bceSMichael Lange     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
819d6232bceSMichael Lange     mbs  = a->compressedrow.nrows;
820d6232bceSMichael Lange     ii   = a->compressedrow.i;
821d6232bceSMichael Lange     ridx = a->compressedrow.rindex;
822d6232bceSMichael Lange     for (i=0; i<mbs; i++) {
823d6232bceSMichael Lange       n    = ii[i+1] - ii[i];
824d6232bceSMichael Lange       idx  = a->j + ii[i];
825d6232bceSMichael Lange       v    = a->a + ii[i]*25;
826d6232bceSMichael Lange       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
827d6232bceSMichael Lange       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
828d6232bceSMichael Lange       PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
829d6232bceSMichael Lange       for (j=0; j<n; j++) {
830d6232bceSMichael Lange 	xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
831d6232bceSMichael Lange 	sum1 += v[25*j]*x1   + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5;
832d6232bceSMichael Lange 	sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5;
833d6232bceSMichael Lange 	sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5;
834d6232bceSMichael Lange 	sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5;
835d6232bceSMichael Lange 	sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5;
836d6232bceSMichael Lange       }
837d6232bceSMichael Lange       z[5*ridx[i]]   = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3;
838d6232bceSMichael Lange       z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5;
839d6232bceSMichael Lange     }
840d6232bceSMichael Lange     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
841d6232bceSMichael Lange     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
842d6232bceSMichael Lange   } else {
843d6232bceSMichael Lange     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);CHKERRQ(ierr);
844d6232bceSMichael Lange   }
845d6232bceSMichael Lange   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
846d6232bceSMichael Lange   PetscFunctionReturn(0);
847d6232bceSMichael Lange }
848d6232bceSMichael Lange #else
8494a2ae208SSatish Balay #undef __FUNCT__
8504a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_5"
851dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
8522d61bbb3SSatish Balay {
8532d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
854d9fead3dSBarry Smith   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
855d9fead3dSBarry Smith   const PetscScalar *xb,*x;
856d9fead3dSBarry Smith   const MatScalar   *v;
857dfbe8321SBarry Smith   PetscErrorCode    ierr;
8580298fd71SBarry Smith   const PetscInt    *idx,*ii,*ridx=NULL;
8597c565772SBarry Smith   PetscInt          mbs,i,j,n;
860ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
8612d61bbb3SSatish Balay 
862433994e6SBarry Smith   PetscFunctionBegin;
8633649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
86426e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
8652d61bbb3SSatish Balay 
8662d61bbb3SSatish Balay   idx = a->j;
8672d61bbb3SSatish Balay   v   = a->a;
86826e093fcSHong Zhang   if (usecprow) {
86926e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
87026e093fcSHong Zhang     ii   = a->compressedrow.i;
8717b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
87226e093fcSHong Zhang   } else {
87326e093fcSHong Zhang     mbs = a->mbs;
8742d61bbb3SSatish Balay     ii  = a->i;
87526e093fcSHong Zhang     z   = zarray;
87626e093fcSHong Zhang   }
8772d61bbb3SSatish Balay 
8782d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
8792d61bbb3SSatish Balay     n           = ii[1] - ii[0]; ii++;
8802d61bbb3SSatish Balay     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
881444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
882444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
8832d61bbb3SSatish Balay     for (j=0; j<n; j++) {
8842d61bbb3SSatish Balay       xb    = x + 5*(*idx++);
8852d61bbb3SSatish Balay       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
8862d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
8872d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
8882d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
8892d61bbb3SSatish Balay       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
8902d61bbb3SSatish Balay       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
8912d61bbb3SSatish Balay       v    += 25;
8922d61bbb3SSatish Balay     }
8937b2bb3b9SHong Zhang     if (usecprow) z = zarray + 5*ridx[i];
8942d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
89526e093fcSHong Zhang     if (!usecprow) z += 5;
8962d61bbb3SSatish Balay   }
8973649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
89826e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
8997c565772SBarry Smith   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
9002d61bbb3SSatish Balay   PetscFunctionReturn(0);
9012d61bbb3SSatish Balay }
902d6232bceSMichael Lange #endif
9032d61bbb3SSatish Balay 
90415091d37SBarry Smith 
905d6232bceSMichael Lange #if defined(PETSC_THREADCOMM_ACTIVE)
906d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
907d6232bceSMichael Lange {
908d6232bceSMichael Lange   PetscErrorCode    ierr;
909d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
910d6232bceSMichael Lange   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
911d6232bceSMichael Lange   const PetscScalar *x,*xb;
912d6232bceSMichael Lange   const MatScalar   *aa;
913d6232bceSMichael Lange   PetscInt          *trstarts=A->rmap->trstarts;
914d6232bceSMichael Lange   PetscInt          n,start,end,i,j;
915d6232bceSMichael Lange   const PetscInt    *aj,*ai;
916d6232bceSMichael Lange 
917d6232bceSMichael Lange   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
918d6232bceSMichael Lange   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
919d6232bceSMichael Lange   start  = trstarts[thread_id] / 6;
920d6232bceSMichael Lange   end    = trstarts[thread_id+1] / 6;
921d6232bceSMichael Lange   ai     = a->i;
922d6232bceSMichael Lange   for (i=start; i<end; i++) {
923d6232bceSMichael Lange     n    = ai[i+1] - ai[i];
924d6232bceSMichael Lange     aj   = a->j + ai[i];
925d6232bceSMichael Lange     aa   = a->a + ai[i]*36;
926d6232bceSMichael Lange     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
927d6232bceSMichael Lange     for (j=0; j<n; j++) {
928d6232bceSMichael Lange       xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
929d6232bceSMichael Lange       sum1 += aa[36*j]*x1   + aa[36*j+6]*x2  + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6;
930d6232bceSMichael Lange       sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2  + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6;
931d6232bceSMichael Lange       sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2  + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6;
932d6232bceSMichael Lange       sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2  + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6;
933d6232bceSMichael Lange       sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6;
934d6232bceSMichael Lange       sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6;
935d6232bceSMichael Lange     }
936d6232bceSMichael Lange     z[6*i]   = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3;
937d6232bceSMichael Lange     z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6;
938d6232bceSMichael Lange   }
939d6232bceSMichael Lange   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
940d6232bceSMichael Lange   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
941d6232bceSMichael Lange   return 0;
942d6232bceSMichael Lange }
943d6232bceSMichael Lange 
944d6232bceSMichael Lange #undef __FUNCT__
945d6232bceSMichael Lange #define __FUNCT__ "MatMult_SeqBAIJ_6"
946d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
947d6232bceSMichael Lange {
948d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
949d6232bceSMichael Lange   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
950d6232bceSMichael Lange   const PetscScalar *x,*xb;
951d6232bceSMichael Lange   const MatScalar   *v;
952d6232bceSMichael Lange   PetscErrorCode    ierr;
953d6232bceSMichael Lange   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
954d6232bceSMichael Lange   PetscBool         usecprow=a->compressedrow.use;
955d6232bceSMichael Lange 
956d6232bceSMichael Lange   PetscFunctionBegin;
957d6232bceSMichael Lange 
958d6232bceSMichael Lange   if (usecprow) {
959d6232bceSMichael Lange     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
960d6232bceSMichael Lange     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
961d6232bceSMichael Lange     mbs  = a->compressedrow.nrows;
962d6232bceSMichael Lange     ii   = a->compressedrow.i;
963d6232bceSMichael Lange     ridx = a->compressedrow.rindex;
964d6232bceSMichael Lange     for (i=0; i<mbs; i++) {
965d6232bceSMichael Lange       n  = ii[i+1] - ii[i];
966d6232bceSMichael Lange       idx  = a->j + ii[i];
967d6232bceSMichael Lange       v    = a->a + ii[i]*36;
968d6232bceSMichael Lange       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
969d6232bceSMichael Lange       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
970d6232bceSMichael Lange       PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
971d6232bceSMichael Lange       for (j=0; j<n; j++) {
972d6232bceSMichael Lange 	xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
973d6232bceSMichael Lange 	sum1 += v[36*j]*x1   + v[36*j+6]*x2  + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6;
974d6232bceSMichael Lange 	sum2 += v[36*j+1]*x1 + v[36*j+7]*x2  + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6;
975d6232bceSMichael Lange 	sum3 += v[36*j+2]*x1 + v[36*j+8]*x2  + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6;
976d6232bceSMichael Lange 	sum4 += v[36*j+3]*x1 + v[36*j+9]*x2  + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6;
977d6232bceSMichael Lange 	sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6;
978d6232bceSMichael Lange 	sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6;
979d6232bceSMichael Lange       }
980d6232bceSMichael Lange       z[6*ridx[i]]   = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3;
981d6232bceSMichael Lange       z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6;
982d6232bceSMichael Lange     }
983d6232bceSMichael Lange     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
984d6232bceSMichael Lange     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
985d6232bceSMichael Lange   } else {
986d6232bceSMichael Lange     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);CHKERRQ(ierr);
987d6232bceSMichael Lange   }
988d6232bceSMichael Lange   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
989d6232bceSMichael Lange   PetscFunctionReturn(0);
990d6232bceSMichael Lange }
991d6232bceSMichael Lange #else
9924a2ae208SSatish Balay #undef __FUNCT__
9934a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_6"
994dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
99515091d37SBarry Smith {
99615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
997d9fead3dSBarry Smith   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
998d9fead3dSBarry Smith   const PetscScalar *x,*xb;
99926e093fcSHong Zhang   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
1000d9fead3dSBarry Smith   const MatScalar   *v;
1001dfbe8321SBarry Smith   PetscErrorCode    ierr;
10027c565772SBarry Smith   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1003ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
100415091d37SBarry Smith 
1005433994e6SBarry Smith   PetscFunctionBegin;
10063649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
100726e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
100815091d37SBarry Smith 
100915091d37SBarry Smith   idx = a->j;
101015091d37SBarry Smith   v   = a->a;
101126e093fcSHong Zhang   if (usecprow) {
101226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
101326e093fcSHong Zhang     ii   = a->compressedrow.i;
10147b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
101526e093fcSHong Zhang   } else {
101626e093fcSHong Zhang     mbs = a->mbs;
101715091d37SBarry Smith     ii  = a->i;
101826e093fcSHong Zhang     z   = zarray;
101926e093fcSHong Zhang   }
102015091d37SBarry Smith 
102115091d37SBarry Smith   for (i=0; i<mbs; i++) {
102226fbe8dcSKarl Rupp     n  = ii[1] - ii[0];
102326fbe8dcSKarl Rupp     ii++;
102426fbe8dcSKarl Rupp     sum1 = 0.0;
102526fbe8dcSKarl Rupp     sum2 = 0.0;
102626fbe8dcSKarl Rupp     sum3 = 0.0;
102726fbe8dcSKarl Rupp     sum4 = 0.0;
102826fbe8dcSKarl Rupp     sum5 = 0.0;
102926fbe8dcSKarl Rupp     sum6 = 0.0;
103026fbe8dcSKarl Rupp 
1031444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1032444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
103315091d37SBarry Smith     for (j=0; j<n; j++) {
103415091d37SBarry Smith       xb    = x + 6*(*idx++);
103515091d37SBarry Smith       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
103615091d37SBarry Smith       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
103715091d37SBarry Smith       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
103815091d37SBarry Smith       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
103915091d37SBarry Smith       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
104015091d37SBarry Smith       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
104115091d37SBarry Smith       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
104215091d37SBarry Smith       v    += 36;
104315091d37SBarry Smith     }
10447b2bb3b9SHong Zhang     if (usecprow) z = zarray + 6*ridx[i];
104515091d37SBarry Smith     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
104626e093fcSHong Zhang     if (!usecprow) z += 6;
104715091d37SBarry Smith   }
104815091d37SBarry Smith 
10493649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
105026e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
10517c565772SBarry Smith   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
105215091d37SBarry Smith   PetscFunctionReturn(0);
105315091d37SBarry Smith }
1054d6232bceSMichael Lange #endif
10558ab949d8SShri Abhyankar 
1056d6232bceSMichael Lange #if defined(PETSC_THREADCOMM_ACTIVE)
1057d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
1058d6232bceSMichael Lange {
1059d6232bceSMichael Lange   PetscErrorCode    ierr;
1060d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1061d6232bceSMichael Lange   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1062d6232bceSMichael Lange   const PetscScalar *x,*xb;
1063d6232bceSMichael Lange   const MatScalar   *aa;
1064d6232bceSMichael Lange   PetscInt          *trstarts=A->rmap->trstarts;
1065d6232bceSMichael Lange   PetscInt          n,start,end,i,j;
1066d6232bceSMichael Lange   const PetscInt    *aj,*ai;
1067d6232bceSMichael Lange 
1068d6232bceSMichael Lange   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1069d6232bceSMichael Lange   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
1070d6232bceSMichael Lange   start  = trstarts[thread_id] / 7;
1071d6232bceSMichael Lange   end    = trstarts[thread_id+1] / 7;
1072d6232bceSMichael Lange   ai     = a->i;
1073d6232bceSMichael Lange   for (i=start; i<end; i++) {
1074d6232bceSMichael Lange     n    = ai[i+1] - ai[i];
1075d6232bceSMichael Lange     aj   = a->j + ai[i];
1076d6232bceSMichael Lange     aa   = a->a + ai[i]*49;
1077d6232bceSMichael Lange     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1078d6232bceSMichael Lange     for (j=0; j<n; j++) {
1079d6232bceSMichael Lange       xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1080d6232bceSMichael Lange       sum1 += aa[49*j]*x1   + aa[49*j+7]*x2  + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7;
1081d6232bceSMichael Lange       sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2  + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7;
1082d6232bceSMichael Lange       sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2  + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7;
1083d6232bceSMichael Lange       sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7;
1084d6232bceSMichael Lange       sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7;
1085d6232bceSMichael Lange       sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7;
1086d6232bceSMichael Lange       sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7;
1087d6232bceSMichael Lange     }
1088d6232bceSMichael Lange     z[7*i]   = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4;
1089d6232bceSMichael Lange     z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7;
1090d6232bceSMichael Lange   }
1091d6232bceSMichael Lange   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1092d6232bceSMichael Lange   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1093d6232bceSMichael Lange   return 0;
1094d6232bceSMichael Lange }
1095d6232bceSMichael Lange 
1096d6232bceSMichael Lange #undef __FUNCT__
1097d6232bceSMichael Lange #define __FUNCT__ "MatMult_SeqBAIJ_7"
1098d6232bceSMichael Lange PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1099d6232bceSMichael Lange {
1100d6232bceSMichael Lange   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1101d6232bceSMichael Lange   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1102d6232bceSMichael Lange   const PetscScalar *x,*xb;
1103d6232bceSMichael Lange   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1104d6232bceSMichael Lange   const MatScalar   *v;
1105d6232bceSMichael Lange   PetscErrorCode    ierr;
1106d6232bceSMichael Lange   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1107d6232bceSMichael Lange   PetscBool         usecprow=a->compressedrow.use;
1108d6232bceSMichael Lange 
1109d6232bceSMichael Lange   PetscFunctionBegin;
1110d6232bceSMichael Lange 
1111d6232bceSMichael Lange   if (usecprow) {
1112d6232bceSMichael Lange     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1113d6232bceSMichael Lange     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1114d6232bceSMichael Lange     mbs  = a->compressedrow.nrows;
1115d6232bceSMichael Lange     ii   = a->compressedrow.i;
1116d6232bceSMichael Lange     ridx = a->compressedrow.rindex;
1117d6232bceSMichael Lange     for (i=0; i<mbs; i++) {
1118d6232bceSMichael Lange       n    = ii[i+1] - ii[i];
1119d6232bceSMichael Lange       idx  = a->j + ii[i];
1120d6232bceSMichael Lange       v    = a->a + ii[i]*49;
1121d6232bceSMichael Lange       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1122d6232bceSMichael Lange       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1123d6232bceSMichael Lange       PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1124d6232bceSMichael Lange       for (j=0; j<n; j++) {
1125d6232bceSMichael Lange 	xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1126d6232bceSMichael Lange 	sum1 += v[49*j]*x1   + v[49*j+7]*x2  + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7;
1127d6232bceSMichael Lange 	sum2 += v[49*j+1]*x1 + v[49*j+8]*x2  + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7;
1128d6232bceSMichael Lange 	sum3 += v[49*j+2]*x1 + v[49*j+9]*x2  + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7;
1129d6232bceSMichael Lange 	sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7;
1130d6232bceSMichael Lange 	sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7;
1131d6232bceSMichael Lange 	sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7;
1132d6232bceSMichael Lange 	sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7;
1133d6232bceSMichael Lange       }
1134d6232bceSMichael Lange       z[7*ridx[i]]   = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4;
1135d6232bceSMichael Lange       z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7;
1136d6232bceSMichael Lange     }
1137d6232bceSMichael Lange     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1138d6232bceSMichael Lange     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1139d6232bceSMichael Lange   } else {
1140d6232bceSMichael Lange     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);CHKERRQ(ierr);
1141d6232bceSMichael Lange   }
1142d6232bceSMichael Lange   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
1143d6232bceSMichael Lange   PetscFunctionReturn(0);
1144d6232bceSMichael Lange }
1145d6232bceSMichael Lange #else
11464a2ae208SSatish Balay #undef __FUNCT__
11474a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_7"
1148dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
11492d61bbb3SSatish Balay {
11502d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1151d9fead3dSBarry Smith   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1152d9fead3dSBarry Smith   const PetscScalar *x,*xb;
115326e093fcSHong Zhang   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1154d9fead3dSBarry Smith   const MatScalar   *v;
1155dfbe8321SBarry Smith   PetscErrorCode    ierr;
11567c565772SBarry Smith   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1157ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
11582d61bbb3SSatish Balay 
1159433994e6SBarry Smith   PetscFunctionBegin;
11603649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
116126e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
11622d61bbb3SSatish Balay 
11632d61bbb3SSatish Balay   idx = a->j;
11642d61bbb3SSatish Balay   v   = a->a;
116526e093fcSHong Zhang   if (usecprow) {
116626e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
116726e093fcSHong Zhang     ii   = a->compressedrow.i;
11687b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
116926e093fcSHong Zhang   } else {
117026e093fcSHong Zhang     mbs = a->mbs;
11712d61bbb3SSatish Balay     ii  = a->i;
117226e093fcSHong Zhang     z   = zarray;
117326e093fcSHong Zhang   }
11742d61bbb3SSatish Balay 
11752d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
117626fbe8dcSKarl Rupp     n  = ii[1] - ii[0];
117726fbe8dcSKarl Rupp     ii++;
117826fbe8dcSKarl Rupp     sum1 = 0.0;
117926fbe8dcSKarl Rupp     sum2 = 0.0;
118026fbe8dcSKarl Rupp     sum3 = 0.0;
118126fbe8dcSKarl Rupp     sum4 = 0.0;
118226fbe8dcSKarl Rupp     sum5 = 0.0;
118326fbe8dcSKarl Rupp     sum6 = 0.0;
118426fbe8dcSKarl Rupp     sum7 = 0.0;
118526fbe8dcSKarl Rupp 
1186444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1187444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
11882d61bbb3SSatish Balay     for (j=0; j<n; j++) {
11892d61bbb3SSatish Balay       xb    = x + 7*(*idx++);
11902d61bbb3SSatish Balay       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
11912d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
11922d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
11932d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
11942d61bbb3SSatish Balay       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
11952d61bbb3SSatish Balay       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
11962d61bbb3SSatish Balay       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
11972d61bbb3SSatish Balay       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
11982d61bbb3SSatish Balay       v    += 49;
11992d61bbb3SSatish Balay     }
12007b2bb3b9SHong Zhang     if (usecprow) z = zarray + 7*ridx[i];
12012d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
120226e093fcSHong Zhang     if (!usecprow) z += 7;
12032d61bbb3SSatish Balay   }
12042d61bbb3SSatish Balay 
12053649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
120626e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
12077c565772SBarry Smith   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
12082d61bbb3SSatish Balay   PetscFunctionReturn(0);
12092d61bbb3SSatish Balay }
1210d6232bceSMichael Lange #endif
12112d61bbb3SSatish Balay 
12128ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1213832cc040SShri Abhyankar /* Default MatMult for block size 15 */
12148ab949d8SShri Abhyankar 
12158ab949d8SShri Abhyankar #undef __FUNCT__
12168ab949d8SShri Abhyankar #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
12178ab949d8SShri Abhyankar PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
12188ab949d8SShri Abhyankar {
12198ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
12208ab949d8SShri Abhyankar   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
12218ab949d8SShri Abhyankar   const PetscScalar *x,*xb;
122253ef36baSBarry Smith   PetscScalar       *zarray,xv;
12238ab949d8SShri Abhyankar   const MatScalar   *v;
12248ab949d8SShri Abhyankar   PetscErrorCode    ierr;
12258ab949d8SShri Abhyankar   const PetscInt    *ii,*ij=a->j,*idx;
12267c565772SBarry Smith   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1227ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
12288ab949d8SShri Abhyankar 
12298ab949d8SShri Abhyankar   PetscFunctionBegin;
12303649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
12318ab949d8SShri Abhyankar   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
12328ab949d8SShri Abhyankar 
12338ab949d8SShri Abhyankar   v = a->a;
12348ab949d8SShri Abhyankar   if (usecprow) {
12358ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
12368ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
12378ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
12388ab949d8SShri Abhyankar   } else {
12398ab949d8SShri Abhyankar     mbs = a->mbs;
12408ab949d8SShri Abhyankar     ii  = a->i;
12418ab949d8SShri Abhyankar     z   = zarray;
12428ab949d8SShri Abhyankar   }
12438ab949d8SShri Abhyankar 
12448ab949d8SShri Abhyankar   for (i=0; i<mbs; i++) {
12458ab949d8SShri Abhyankar     n    = ii[i+1] - ii[i];
12468ab949d8SShri Abhyankar     idx  = ij + ii[i];
12478ab949d8SShri Abhyankar     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
12488ab949d8SShri Abhyankar     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
12498ab949d8SShri Abhyankar 
12508ab949d8SShri Abhyankar     for (j=0; j<n; j++) {
12518ab949d8SShri Abhyankar       xb = x + 15*(idx[j]);
12528ab949d8SShri Abhyankar 
12538ab949d8SShri Abhyankar       for (k=0; k<15; k++) {
125453ef36baSBarry Smith         xv     =  xb[k];
125553ef36baSBarry Smith         sum1  += v[0]*xv;
125653ef36baSBarry Smith         sum2  += v[1]*xv;
125753ef36baSBarry Smith         sum3  += v[2]*xv;
125853ef36baSBarry Smith         sum4  += v[3]*xv;
125953ef36baSBarry Smith         sum5  += v[4]*xv;
126053ef36baSBarry Smith         sum6  += v[5]*xv;
126153ef36baSBarry Smith         sum7  += v[6]*xv;
126253ef36baSBarry Smith         sum8  += v[7]*xv;
126353ef36baSBarry Smith         sum9  += v[8]*xv;
126453ef36baSBarry Smith         sum10 += v[9]*xv;
126553ef36baSBarry Smith         sum11 += v[10]*xv;
126653ef36baSBarry Smith         sum12 += v[11]*xv;
126753ef36baSBarry Smith         sum13 += v[12]*xv;
126853ef36baSBarry Smith         sum14 += v[13]*xv;
126953ef36baSBarry Smith         sum15 += v[14]*xv;
12708ab949d8SShri Abhyankar         v     += 15;
12718ab949d8SShri Abhyankar       }
12728ab949d8SShri Abhyankar     }
12738ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15*ridx[i];
12748ab949d8SShri Abhyankar     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
12758ab949d8SShri Abhyankar     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
12768ab949d8SShri Abhyankar 
12778ab949d8SShri Abhyankar     if (!usecprow) z += 15;
12788ab949d8SShri Abhyankar   }
12798ab949d8SShri Abhyankar 
12803649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
12818ab949d8SShri Abhyankar   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
12827c565772SBarry Smith   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
12838ab949d8SShri Abhyankar   PetscFunctionReturn(0);
12848ab949d8SShri Abhyankar }
12858ab949d8SShri Abhyankar 
12868ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
12878ab949d8SShri Abhyankar #undef __FUNCT__
12888ab949d8SShri Abhyankar #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
12898ab949d8SShri Abhyankar PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
12908ab949d8SShri Abhyankar {
12918ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
12928ab949d8SShri Abhyankar   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
12938ab949d8SShri Abhyankar   const PetscScalar *x,*xb;
12940b8f6341SShri Abhyankar   PetscScalar       x1,x2,x3,x4,*zarray;
12958ab949d8SShri Abhyankar   const MatScalar   *v;
12968ab949d8SShri Abhyankar   PetscErrorCode    ierr;
12978ab949d8SShri Abhyankar   const PetscInt    *ii,*ij=a->j,*idx;
12987c565772SBarry Smith   PetscInt          mbs,i,j,n,*ridx=NULL;
1299ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
13008ab949d8SShri Abhyankar 
13018ab949d8SShri Abhyankar   PetscFunctionBegin;
13023649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
13038ab949d8SShri Abhyankar   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
13048ab949d8SShri Abhyankar 
13058ab949d8SShri Abhyankar   v = a->a;
13068ab949d8SShri Abhyankar   if (usecprow) {
13078ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
13088ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
13098ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
13108ab949d8SShri Abhyankar   } else {
13118ab949d8SShri Abhyankar     mbs = a->mbs;
13128ab949d8SShri Abhyankar     ii  = a->i;
13138ab949d8SShri Abhyankar     z   = zarray;
13148ab949d8SShri Abhyankar   }
13158ab949d8SShri Abhyankar 
13168ab949d8SShri Abhyankar   for (i=0; i<mbs; i++) {
13178ab949d8SShri Abhyankar     n    = ii[i+1] - ii[i];
13188ab949d8SShri Abhyankar     idx  = ij + ii[i];
13198ab949d8SShri Abhyankar     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
13208ab949d8SShri Abhyankar     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
13218ab949d8SShri Abhyankar 
13228ab949d8SShri Abhyankar     for (j=0; j<n; j++) {
13238ab949d8SShri Abhyankar       xb = x + 15*(idx[j]);
13240b8f6341SShri Abhyankar       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
13258ab949d8SShri Abhyankar 
13268ab949d8SShri Abhyankar       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
13278ab949d8SShri Abhyankar       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
13288ab949d8SShri Abhyankar       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
13298ab949d8SShri Abhyankar       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
13308ab949d8SShri Abhyankar       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
13318ab949d8SShri Abhyankar       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
13328ab949d8SShri Abhyankar       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
13338ab949d8SShri Abhyankar       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
13348ab949d8SShri Abhyankar       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
13358ab949d8SShri Abhyankar       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
13368ab949d8SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
13378ab949d8SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
13388ab949d8SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
13398ab949d8SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
13408ab949d8SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
13418ab949d8SShri Abhyankar 
13428ab949d8SShri Abhyankar       v += 60;
13438ab949d8SShri Abhyankar 
13440b8f6341SShri Abhyankar       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
13458ab949d8SShri Abhyankar 
13468ab949d8SShri Abhyankar       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
13478ab949d8SShri Abhyankar       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
13488ab949d8SShri Abhyankar       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
13498ab949d8SShri Abhyankar       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
13508ab949d8SShri Abhyankar       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
13518ab949d8SShri Abhyankar       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
13528ab949d8SShri Abhyankar       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
13538ab949d8SShri Abhyankar       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
13548ab949d8SShri Abhyankar       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
13558ab949d8SShri Abhyankar       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
13568ab949d8SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
13578ab949d8SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
13588ab949d8SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
13598ab949d8SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
13608ab949d8SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
13618ab949d8SShri Abhyankar       v     += 60;
13628ab949d8SShri Abhyankar 
13630b8f6341SShri Abhyankar       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
13640b8f6341SShri Abhyankar       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
13650b8f6341SShri Abhyankar       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
13660b8f6341SShri Abhyankar       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
13670b8f6341SShri Abhyankar       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
13680b8f6341SShri Abhyankar       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
13690b8f6341SShri Abhyankar       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
13700b8f6341SShri Abhyankar       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
13710b8f6341SShri Abhyankar       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
13720b8f6341SShri Abhyankar       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
13730b8f6341SShri Abhyankar       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
13740b8f6341SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
13750b8f6341SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
13760b8f6341SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
13770b8f6341SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
13780b8f6341SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
13790b8f6341SShri Abhyankar       v     += 60;
13800b8f6341SShri Abhyankar 
13810b8f6341SShri Abhyankar       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
13828ab949d8SShri Abhyankar       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
13838ab949d8SShri Abhyankar       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
13848ab949d8SShri Abhyankar       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
13858ab949d8SShri Abhyankar       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
13868ab949d8SShri Abhyankar       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
13878ab949d8SShri Abhyankar       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
13888ab949d8SShri Abhyankar       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
13898ab949d8SShri Abhyankar       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
13908ab949d8SShri Abhyankar       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
13918ab949d8SShri Abhyankar       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
13928ab949d8SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
13938ab949d8SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
13948ab949d8SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
13958ab949d8SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
13968ab949d8SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
13978ab949d8SShri Abhyankar       v     += 45;
13988ab949d8SShri Abhyankar     }
13998ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15*ridx[i];
14008ab949d8SShri Abhyankar     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
14018ab949d8SShri Abhyankar     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
14028ab949d8SShri Abhyankar 
14038ab949d8SShri Abhyankar     if (!usecprow) z += 15;
14048ab949d8SShri Abhyankar   }
14058ab949d8SShri Abhyankar 
14063649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
14078ab949d8SShri Abhyankar   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
14087c565772SBarry Smith   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
14098ab949d8SShri Abhyankar   PetscFunctionReturn(0);
14108ab949d8SShri Abhyankar }
14118ab949d8SShri Abhyankar 
14128ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
14138ab949d8SShri Abhyankar #undef __FUNCT__
14148ab949d8SShri Abhyankar #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
14158ab949d8SShri Abhyankar PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
14168ab949d8SShri Abhyankar {
14178ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
14188ab949d8SShri Abhyankar   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
14198ab949d8SShri Abhyankar   const PetscScalar *x,*xb;
14200b8f6341SShri Abhyankar   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
14218ab949d8SShri Abhyankar   const MatScalar   *v;
14228ab949d8SShri Abhyankar   PetscErrorCode    ierr;
14238ab949d8SShri Abhyankar   const PetscInt    *ii,*ij=a->j,*idx;
14247c565772SBarry Smith   PetscInt          mbs,i,j,n,*ridx=NULL;
1425ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
14268ab949d8SShri Abhyankar 
14278ab949d8SShri Abhyankar   PetscFunctionBegin;
14283649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
14298ab949d8SShri Abhyankar   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
14308ab949d8SShri Abhyankar 
14318ab949d8SShri Abhyankar   v = a->a;
14328ab949d8SShri Abhyankar   if (usecprow) {
14338ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
14348ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
14358ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
14368ab949d8SShri Abhyankar   } else {
14378ab949d8SShri Abhyankar     mbs = a->mbs;
14388ab949d8SShri Abhyankar     ii  = a->i;
14398ab949d8SShri Abhyankar     z   = zarray;
14408ab949d8SShri Abhyankar   }
14418ab949d8SShri Abhyankar 
14428ab949d8SShri Abhyankar   for (i=0; i<mbs; i++) {
14438ab949d8SShri Abhyankar     n    = ii[i+1] - ii[i];
14448ab949d8SShri Abhyankar     idx  = ij + ii[i];
14458ab949d8SShri Abhyankar     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
14468ab949d8SShri Abhyankar     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
14478ab949d8SShri Abhyankar 
14488ab949d8SShri Abhyankar     for (j=0; j<n; j++) {
14498ab949d8SShri Abhyankar       xb = x + 15*(idx[j]);
14508ab949d8SShri Abhyankar       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
14510b8f6341SShri Abhyankar       x8 = xb[7];
14528ab949d8SShri Abhyankar 
14538ab949d8SShri Abhyankar       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
14548ab949d8SShri Abhyankar       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
14558ab949d8SShri Abhyankar       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
14568ab949d8SShri Abhyankar       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
14578ab949d8SShri Abhyankar       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
14588ab949d8SShri Abhyankar       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
14598ab949d8SShri Abhyankar       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
14608ab949d8SShri Abhyankar       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
14618ab949d8SShri Abhyankar       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
14628ab949d8SShri Abhyankar       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
14638ab949d8SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
14648ab949d8SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
14658ab949d8SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
14668ab949d8SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
14678ab949d8SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
14688ab949d8SShri Abhyankar       v     += 120;
14698ab949d8SShri Abhyankar 
14700b8f6341SShri Abhyankar       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
14710b8f6341SShri Abhyankar 
14728ab949d8SShri Abhyankar       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
14738ab949d8SShri Abhyankar       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
14748ab949d8SShri Abhyankar       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
14758ab949d8SShri Abhyankar       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
14768ab949d8SShri Abhyankar       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
14778ab949d8SShri Abhyankar       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
14788ab949d8SShri Abhyankar       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
14798ab949d8SShri Abhyankar       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
14808ab949d8SShri Abhyankar       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
14818ab949d8SShri Abhyankar       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
14828ab949d8SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
14838ab949d8SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
14848ab949d8SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
14858ab949d8SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
14868ab949d8SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
14878ab949d8SShri Abhyankar       v     += 105;
14888ab949d8SShri Abhyankar     }
14898ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15*ridx[i];
14908ab949d8SShri Abhyankar     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
14918ab949d8SShri Abhyankar     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
14928ab949d8SShri Abhyankar 
14938ab949d8SShri Abhyankar     if (!usecprow) z += 15;
14948ab949d8SShri Abhyankar   }
14958ab949d8SShri Abhyankar 
14963649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
14978ab949d8SShri Abhyankar   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
14987c565772SBarry Smith   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
14998ab949d8SShri Abhyankar   PetscFunctionReturn(0);
15008ab949d8SShri Abhyankar }
15018ab949d8SShri Abhyankar 
15028ab949d8SShri Abhyankar /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
15038ab949d8SShri Abhyankar 
15048ab949d8SShri Abhyankar #undef __FUNCT__
15058ab949d8SShri Abhyankar #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
15068ab949d8SShri Abhyankar PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
15078ab949d8SShri Abhyankar {
15088ab949d8SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
15098ab949d8SShri Abhyankar   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
15108ab949d8SShri Abhyankar   const PetscScalar *x,*xb;
15118ab949d8SShri Abhyankar   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
15128ab949d8SShri Abhyankar   const MatScalar   *v;
15138ab949d8SShri Abhyankar   PetscErrorCode    ierr;
15148ab949d8SShri Abhyankar   const PetscInt    *ii,*ij=a->j,*idx;
15157c565772SBarry Smith   PetscInt          mbs,i,j,n,*ridx=NULL;
1516ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
15178ab949d8SShri Abhyankar 
15188ab949d8SShri Abhyankar   PetscFunctionBegin;
15193649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
15208ab949d8SShri Abhyankar   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
15218ab949d8SShri Abhyankar 
15228ab949d8SShri Abhyankar   v = a->a;
15238ab949d8SShri Abhyankar   if (usecprow) {
15248ab949d8SShri Abhyankar     mbs  = a->compressedrow.nrows;
15258ab949d8SShri Abhyankar     ii   = a->compressedrow.i;
15268ab949d8SShri Abhyankar     ridx = a->compressedrow.rindex;
15278ab949d8SShri Abhyankar   } else {
15288ab949d8SShri Abhyankar     mbs = a->mbs;
15298ab949d8SShri Abhyankar     ii  = a->i;
15308ab949d8SShri Abhyankar     z   = zarray;
15318ab949d8SShri Abhyankar   }
15328ab949d8SShri Abhyankar 
15338ab949d8SShri Abhyankar   for (i=0; i<mbs; i++) {
15348ab949d8SShri Abhyankar     n    = ii[i+1] - ii[i];
15358ab949d8SShri Abhyankar     idx  = ij + ii[i];
15368ab949d8SShri Abhyankar     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
15378ab949d8SShri Abhyankar     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
15388ab949d8SShri Abhyankar 
15398ab949d8SShri Abhyankar     for (j=0; j<n; j++) {
15408ab949d8SShri Abhyankar       xb = x + 15*(idx[j]);
15418ab949d8SShri Abhyankar       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
15428ab949d8SShri Abhyankar       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
15438ab949d8SShri Abhyankar 
15448ab949d8SShri Abhyankar       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
15458ab949d8SShri Abhyankar       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
15468ab949d8SShri Abhyankar       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
15478ab949d8SShri Abhyankar       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
15488ab949d8SShri Abhyankar       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
15498ab949d8SShri Abhyankar       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
15508ab949d8SShri Abhyankar       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
15518ab949d8SShri Abhyankar       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
15528ab949d8SShri Abhyankar       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
15538ab949d8SShri Abhyankar       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
15548ab949d8SShri Abhyankar       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
15558ab949d8SShri Abhyankar       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
15568ab949d8SShri Abhyankar       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
15578ab949d8SShri Abhyankar       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
15588ab949d8SShri Abhyankar       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
15598ab949d8SShri Abhyankar       v     += 225;
15608ab949d8SShri Abhyankar     }
15618ab949d8SShri Abhyankar     if (usecprow) z = zarray + 15*ridx[i];
15628ab949d8SShri Abhyankar     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
15638ab949d8SShri Abhyankar     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
15648ab949d8SShri Abhyankar 
15658ab949d8SShri Abhyankar     if (!usecprow) z += 15;
15668ab949d8SShri Abhyankar   }
15678ab949d8SShri Abhyankar 
15683649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
15698ab949d8SShri Abhyankar   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
15707c565772SBarry Smith   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
15718ab949d8SShri Abhyankar   PetscFunctionReturn(0);
15728ab949d8SShri Abhyankar }
15738ab949d8SShri Abhyankar 
15748ab949d8SShri Abhyankar 
15753f1db9ecSBarry Smith /*
15763f1db9ecSBarry Smith     This will not work with MatScalar == float because it calls the BLAS
15773f1db9ecSBarry Smith */
15784a2ae208SSatish Balay #undef __FUNCT__
15794a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqBAIJ_N"
1580dfbe8321SBarry Smith PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
15812d61bbb3SSatish Balay {
15822d61bbb3SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1583dcf5cc72SBarry Smith   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
15843f1db9ecSBarry Smith   MatScalar      *v;
1585dfbe8321SBarry Smith   PetscErrorCode ierr;
1586a2ea699eSBarry Smith   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
15877c565772SBarry Smith   PetscInt       ncols,k,*ridx=NULL;
1588ace3abfcSBarry Smith   PetscBool      usecprow=a->compressedrow.use;
15892d61bbb3SSatish Balay 
15902d61bbb3SSatish Balay   PetscFunctionBegin;
15911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
159226e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
15932d61bbb3SSatish Balay 
15942d61bbb3SSatish Balay   idx = a->j;
15952d61bbb3SSatish Balay   v   = a->a;
159626e093fcSHong Zhang   if (usecprow) {
159726e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
159826e093fcSHong Zhang     ii   = a->compressedrow.i;
15997b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
160026e093fcSHong Zhang   } else {
160126e093fcSHong Zhang     mbs = a->mbs;
16022d61bbb3SSatish Balay     ii  = a->i;
160326e093fcSHong Zhang     z   = zarray;
160426e093fcSHong Zhang   }
1605218c64b6SSatish Balay 
16062d61bbb3SSatish Balay   if (!a->mult_work) {
1607d0f46423SBarry Smith     k    = PetscMax(A->rmap->n,A->cmap->n);
160887828ca2SBarry Smith     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
16092d61bbb3SSatish Balay   }
16102d61bbb3SSatish Balay   work = a->mult_work;
16112d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
16122d61bbb3SSatish Balay     n           = ii[1] - ii[0]; ii++;
16132d61bbb3SSatish Balay     ncols       = n*bs;
16142d61bbb3SSatish Balay     workt       = work;
16152d61bbb3SSatish Balay     for (j=0; j<n; j++) {
16162d61bbb3SSatish Balay       xb = x + bs*(*idx++);
16172d61bbb3SSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
16182d61bbb3SSatish Balay       workt += bs;
16192d61bbb3SSatish Balay     }
16207b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs*ridx[i];
162196b95a6bSBarry Smith     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
162271044d3cSBarry Smith     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
16232d61bbb3SSatish Balay     v += n*bs2;
162426e093fcSHong Zhang     if (!usecprow) z += bs;
16252d61bbb3SSatish Balay   }
16261ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
162726e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
16287c565772SBarry Smith   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
16292d61bbb3SSatish Balay   PetscFunctionReturn(0);
16302d61bbb3SSatish Balay }
16312d61bbb3SSatish Balay 
16324a2ae208SSatish Balay #undef __FUNCT__
16334a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1634dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
16352d61bbb3SSatish Balay {
16362d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1637122f12eaSBarry Smith   const PetscScalar *x;
1638122f12eaSBarry Smith   PetscScalar       *y,*z,sum;
1639122f12eaSBarry Smith   const MatScalar   *v;
1640dfbe8321SBarry Smith   PetscErrorCode    ierr;
16417c565772SBarry Smith   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1642122f12eaSBarry Smith   const PetscInt    *idx,*ii;
1643ace3abfcSBarry Smith   PetscBool         usecprow=a->compressedrow.use;
16442d61bbb3SSatish Balay 
16452d61bbb3SSatish Balay   PetscFunctionBegin;
16463649974fSBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1647122f12eaSBarry Smith   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
16482e8a6d31SBarry Smith   if (zz != yy) {
1649122f12eaSBarry Smith     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
16502e8a6d31SBarry Smith   } else {
1651122f12eaSBarry Smith     z = y;
16522e8a6d31SBarry Smith   }
16532d61bbb3SSatish Balay 
16542d61bbb3SSatish Balay   idx = a->j;
16552d61bbb3SSatish Balay   v   = a->a;
165626e093fcSHong Zhang   if (usecprow) {
16574eb6d288SHong Zhang     if (zz != yy) {
1658122f12eaSBarry Smith       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
16594eb6d288SHong Zhang     }
166026e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
166126e093fcSHong Zhang     ii   = a->compressedrow.i;
16627b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
166326e093fcSHong Zhang   } else {
16642d61bbb3SSatish Balay     ii = a->i;
166526e093fcSHong Zhang   }
16662d61bbb3SSatish Balay 
16672d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
1668122f12eaSBarry Smith     n = ii[1] - ii[0];
1669122f12eaSBarry Smith     ii++;
167026e093fcSHong Zhang     if (!usecprow) {
1671122f12eaSBarry Smith       sum         = y[i];
1672122f12eaSBarry Smith     } else {
1673122f12eaSBarry Smith       sum = y[ridx[i]];
1674122f12eaSBarry Smith     }
1675444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1676444d8c10SJed Brown     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1677122f12eaSBarry Smith     PetscSparseDensePlusDot(sum,x,v,idx,n);
1678122f12eaSBarry Smith     v   += n;
1679122f12eaSBarry Smith     idx += n;
1680122f12eaSBarry Smith     if (usecprow) {
1681122f12eaSBarry Smith       z[ridx[i]] = sum;
1682122f12eaSBarry Smith     } else {
1683122f12eaSBarry Smith       z[i] = sum;
168426e093fcSHong Zhang     }
16852d61bbb3SSatish Balay   }
16863649974fSBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1687122f12eaSBarry Smith   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
16882e8a6d31SBarry Smith   if (zz != yy) {
1689122f12eaSBarry Smith     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
16902e8a6d31SBarry Smith   }
16917c565772SBarry Smith   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
16922d61bbb3SSatish Balay   PetscFunctionReturn(0);
16932d61bbb3SSatish Balay }
16942d61bbb3SSatish Balay 
16954a2ae208SSatish Balay #undef __FUNCT__
16964a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1697dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
16982d61bbb3SSatish Balay {
16992d61bbb3SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1700dcf5cc72SBarry Smith   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
170126e093fcSHong Zhang   PetscScalar    x1,x2,*yarray,*zarray;
17023f1db9ecSBarry Smith   MatScalar      *v;
1703dfbe8321SBarry Smith   PetscErrorCode ierr;
17040298fd71SBarry Smith   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1705ace3abfcSBarry Smith   PetscBool      usecprow=a->compressedrow.use;
17062d61bbb3SSatish Balay 
17072d61bbb3SSatish Balay   PetscFunctionBegin;
17081ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
170926e093fcSHong Zhang   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
17102e8a6d31SBarry Smith   if (zz != yy) {
171126e093fcSHong Zhang     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
17122e8a6d31SBarry Smith   } else {
171326e093fcSHong Zhang     zarray = yarray;
17142e8a6d31SBarry Smith   }
17152d61bbb3SSatish Balay 
17162d61bbb3SSatish Balay   idx = a->j;
17172d61bbb3SSatish Balay   v   = a->a;
171826e093fcSHong Zhang   if (usecprow) {
17194eb6d288SHong Zhang     if (zz != yy) {
17204eb6d288SHong Zhang       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
17214eb6d288SHong Zhang     }
172226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
172326e093fcSHong Zhang     ii   = a->compressedrow.i;
17247b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
17254eb6d288SHong Zhang     if (zz != yy) {
17264eb6d288SHong Zhang       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
17274eb6d288SHong Zhang     }
172826e093fcSHong Zhang   } else {
17292d61bbb3SSatish Balay     ii = a->i;
173026e093fcSHong Zhang     y  = yarray;
173126e093fcSHong Zhang     z  = zarray;
173226e093fcSHong Zhang   }
17332d61bbb3SSatish Balay 
17342d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
17352d61bbb3SSatish Balay     n = ii[1] - ii[0]; ii++;
173626e093fcSHong Zhang     if (usecprow) {
17377b2bb3b9SHong Zhang       z = zarray + 2*ridx[i];
17387b2bb3b9SHong Zhang       y = yarray + 2*ridx[i];
173926e093fcSHong Zhang     }
17402d61bbb3SSatish Balay     sum1 = y[0]; sum2 = y[1];
1741444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1742444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
17432d61bbb3SSatish Balay     for (j=0; j<n; j++) {
174426fbe8dcSKarl Rupp       xb = x + 2*(*idx++);
174526fbe8dcSKarl Rupp       x1 = xb[0];
174626fbe8dcSKarl Rupp       x2 = xb[1];
174726fbe8dcSKarl Rupp 
17482d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[2]*x2;
17492d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[3]*x2;
17502d61bbb3SSatish Balay       v    += 4;
17512d61bbb3SSatish Balay     }
17522d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2;
175326e093fcSHong Zhang     if (!usecprow) {
17542d61bbb3SSatish Balay       z += 2; y += 2;
17552d61bbb3SSatish Balay     }
175626e093fcSHong Zhang   }
17571ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175826e093fcSHong Zhang   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
17592e8a6d31SBarry Smith   if (zz != yy) {
176026e093fcSHong Zhang     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
17612e8a6d31SBarry Smith   }
1762dc0b31edSSatish Balay   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
17632d61bbb3SSatish Balay   PetscFunctionReturn(0);
17642d61bbb3SSatish Balay }
17652d61bbb3SSatish Balay 
17664a2ae208SSatish Balay #undef __FUNCT__
17674a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1768dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
17692d61bbb3SSatish Balay {
17702d61bbb3SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1771dcf5cc72SBarry Smith   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
17723f1db9ecSBarry Smith   MatScalar      *v;
1773dfbe8321SBarry Smith   PetscErrorCode ierr;
17740298fd71SBarry Smith   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1775ace3abfcSBarry Smith   PetscBool      usecprow=a->compressedrow.use;
17762d61bbb3SSatish Balay 
17772d61bbb3SSatish Balay   PetscFunctionBegin;
17781ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
177926e093fcSHong Zhang   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
17802e8a6d31SBarry Smith   if (zz != yy) {
178126e093fcSHong Zhang     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
17822e8a6d31SBarry Smith   } else {
178326e093fcSHong Zhang     zarray = yarray;
17842e8a6d31SBarry Smith   }
17852d61bbb3SSatish Balay 
17862d61bbb3SSatish Balay   idx = a->j;
17872d61bbb3SSatish Balay   v   = a->a;
178826e093fcSHong Zhang   if (usecprow) {
17894eb6d288SHong Zhang     if (zz != yy) {
17904eb6d288SHong Zhang       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
17914eb6d288SHong Zhang     }
179226e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
179326e093fcSHong Zhang     ii   = a->compressedrow.i;
17947b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
179526e093fcSHong Zhang   } else {
17962d61bbb3SSatish Balay     ii = a->i;
179726e093fcSHong Zhang     y  = yarray;
179826e093fcSHong Zhang     z  = zarray;
179926e093fcSHong Zhang   }
18002d61bbb3SSatish Balay 
18012d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
18022d61bbb3SSatish Balay     n = ii[1] - ii[0]; ii++;
180326e093fcSHong Zhang     if (usecprow) {
18047b2bb3b9SHong Zhang       z = zarray + 3*ridx[i];
18057b2bb3b9SHong Zhang       y = yarray + 3*ridx[i];
180626e093fcSHong Zhang     }
18072d61bbb3SSatish Balay     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1808444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1809444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
18102d61bbb3SSatish Balay     for (j=0; j<n; j++) {
18112d61bbb3SSatish Balay       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
18122d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
18132d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
18142d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
18152d61bbb3SSatish Balay       v    += 9;
18162d61bbb3SSatish Balay     }
18172d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3;
181826e093fcSHong Zhang     if (!usecprow) {
18192d61bbb3SSatish Balay       z += 3; y += 3;
18202d61bbb3SSatish Balay     }
182126e093fcSHong Zhang   }
18221ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
182326e093fcSHong Zhang   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
18242e8a6d31SBarry Smith   if (zz != yy) {
182526e093fcSHong Zhang     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
18262e8a6d31SBarry Smith   }
1827dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
18282d61bbb3SSatish Balay   PetscFunctionReturn(0);
18292d61bbb3SSatish Balay }
18302d61bbb3SSatish Balay 
18314a2ae208SSatish Balay #undef __FUNCT__
18324a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1833dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
18342d61bbb3SSatish Balay {
18352d61bbb3SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1836dcf5cc72SBarry Smith   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
18373f1db9ecSBarry Smith   MatScalar      *v;
1838dfbe8321SBarry Smith   PetscErrorCode ierr;
18390298fd71SBarry Smith   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1840ace3abfcSBarry Smith   PetscBool      usecprow=a->compressedrow.use;
18412d61bbb3SSatish Balay 
18422d61bbb3SSatish Balay   PetscFunctionBegin;
18431ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
184426e093fcSHong Zhang   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
18452e8a6d31SBarry Smith   if (zz != yy) {
184626e093fcSHong Zhang     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
18472e8a6d31SBarry Smith   } else {
184826e093fcSHong Zhang     zarray = yarray;
18492e8a6d31SBarry Smith   }
18502d61bbb3SSatish Balay 
18512d61bbb3SSatish Balay   idx = a->j;
18522d61bbb3SSatish Balay   v   = a->a;
185326e093fcSHong Zhang   if (usecprow) {
18544eb6d288SHong Zhang     if (zz != yy) {
18554eb6d288SHong Zhang       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
18564eb6d288SHong Zhang     }
185726e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
185826e093fcSHong Zhang     ii   = a->compressedrow.i;
18597b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
186026e093fcSHong Zhang   } else {
18612d61bbb3SSatish Balay     ii = a->i;
186226e093fcSHong Zhang     y  = yarray;
186326e093fcSHong Zhang     z  = zarray;
186426e093fcSHong Zhang   }
18652d61bbb3SSatish Balay 
18662d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
18672d61bbb3SSatish Balay     n = ii[1] - ii[0]; ii++;
186826e093fcSHong Zhang     if (usecprow) {
18697b2bb3b9SHong Zhang       z = zarray + 4*ridx[i];
18707b2bb3b9SHong Zhang       y = yarray + 4*ridx[i];
187126e093fcSHong Zhang     }
18722d61bbb3SSatish Balay     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1873444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1874444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
18752d61bbb3SSatish Balay     for (j=0; j<n; j++) {
18762d61bbb3SSatish Balay       xb    = x + 4*(*idx++);
18772d61bbb3SSatish Balay       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
18782d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
18792d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
18802d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
18812d61bbb3SSatish Balay       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
18822d61bbb3SSatish Balay       v    += 16;
18832d61bbb3SSatish Balay     }
18842d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
188526e093fcSHong Zhang     if (!usecprow) {
18862d61bbb3SSatish Balay       z += 4; y += 4;
18872d61bbb3SSatish Balay     }
188826e093fcSHong Zhang   }
18891ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
189026e093fcSHong Zhang   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
18912e8a6d31SBarry Smith   if (zz != yy) {
189226e093fcSHong Zhang     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
18932e8a6d31SBarry Smith   }
1894dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
18952d61bbb3SSatish Balay   PetscFunctionReturn(0);
18962d61bbb3SSatish Balay }
18972d61bbb3SSatish Balay 
18984a2ae208SSatish Balay #undef __FUNCT__
18994a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1900dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
19012d61bbb3SSatish Balay {
19022d61bbb3SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1903dcf5cc72SBarry Smith   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
190426e093fcSHong Zhang   PetscScalar    *yarray,*zarray;
19053f1db9ecSBarry Smith   MatScalar      *v;
1906dfbe8321SBarry Smith   PetscErrorCode ierr;
19070298fd71SBarry Smith   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1908ace3abfcSBarry Smith   PetscBool      usecprow=a->compressedrow.use;
19092d61bbb3SSatish Balay 
19102d61bbb3SSatish Balay   PetscFunctionBegin;
19111ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
191226e093fcSHong Zhang   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
19132e8a6d31SBarry Smith   if (zz != yy) {
191426e093fcSHong Zhang     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
19152e8a6d31SBarry Smith   } else {
191626e093fcSHong Zhang     zarray = yarray;
19172e8a6d31SBarry Smith   }
19182d61bbb3SSatish Balay 
19192d61bbb3SSatish Balay   idx = a->j;
19202d61bbb3SSatish Balay   v   = a->a;
192126e093fcSHong Zhang   if (usecprow) {
19224eb6d288SHong Zhang     if (zz != yy) {
19234eb6d288SHong Zhang       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
19244eb6d288SHong Zhang     }
192526e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
192626e093fcSHong Zhang     ii   = a->compressedrow.i;
19277b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
192826e093fcSHong Zhang   } else {
19292d61bbb3SSatish Balay     ii = a->i;
193026e093fcSHong Zhang     y  = yarray;
193126e093fcSHong Zhang     z  = zarray;
193226e093fcSHong Zhang   }
19332d61bbb3SSatish Balay 
19342d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
19352d61bbb3SSatish Balay     n = ii[1] - ii[0]; ii++;
193626e093fcSHong Zhang     if (usecprow) {
19377b2bb3b9SHong Zhang       z = zarray + 5*ridx[i];
19387b2bb3b9SHong Zhang       y = yarray + 5*ridx[i];
193926e093fcSHong Zhang     }
19402d61bbb3SSatish Balay     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1941444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1942444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
19432d61bbb3SSatish Balay     for (j=0; j<n; j++) {
19442d61bbb3SSatish Balay       xb    = x + 5*(*idx++);
19452d61bbb3SSatish Balay       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
19462d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
19472d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
19482d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
19492d61bbb3SSatish Balay       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
19502d61bbb3SSatish Balay       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
19512d61bbb3SSatish Balay       v    += 25;
19522d61bbb3SSatish Balay     }
19532d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
195426e093fcSHong Zhang     if (!usecprow) {
19552d61bbb3SSatish Balay       z += 5; y += 5;
19562d61bbb3SSatish Balay     }
195726e093fcSHong Zhang   }
19581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
195926e093fcSHong Zhang   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
19602e8a6d31SBarry Smith   if (zz != yy) {
196126e093fcSHong Zhang     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
19622e8a6d31SBarry Smith   }
1963dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
19642d61bbb3SSatish Balay   PetscFunctionReturn(0);
19652d61bbb3SSatish Balay }
19664a2ae208SSatish Balay #undef __FUNCT__
19674a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1968dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
196915091d37SBarry Smith {
197015091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1971dcf5cc72SBarry Smith   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
197226e093fcSHong Zhang   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
197315091d37SBarry Smith   MatScalar      *v;
1974dfbe8321SBarry Smith   PetscErrorCode ierr;
19750298fd71SBarry Smith   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1976ace3abfcSBarry Smith   PetscBool      usecprow=a->compressedrow.use;
197715091d37SBarry Smith 
197815091d37SBarry Smith   PetscFunctionBegin;
19791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
198026e093fcSHong Zhang   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
198115091d37SBarry Smith   if (zz != yy) {
198226e093fcSHong Zhang     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
198315091d37SBarry Smith   } else {
198426e093fcSHong Zhang     zarray = yarray;
198515091d37SBarry Smith   }
198615091d37SBarry Smith 
198715091d37SBarry Smith   idx = a->j;
198815091d37SBarry Smith   v   = a->a;
198926e093fcSHong Zhang   if (usecprow) {
19904eb6d288SHong Zhang     if (zz != yy) {
19914eb6d288SHong Zhang       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
19924eb6d288SHong Zhang     }
199326e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
199426e093fcSHong Zhang     ii   = a->compressedrow.i;
19957b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
199626e093fcSHong Zhang   } else {
199715091d37SBarry Smith     ii = a->i;
199826e093fcSHong Zhang     y  = yarray;
199926e093fcSHong Zhang     z  = zarray;
200026e093fcSHong Zhang   }
200115091d37SBarry Smith 
200215091d37SBarry Smith   for (i=0; i<mbs; i++) {
200315091d37SBarry Smith     n = ii[1] - ii[0]; ii++;
200426e093fcSHong Zhang     if (usecprow) {
20057b2bb3b9SHong Zhang       z = zarray + 6*ridx[i];
20067b2bb3b9SHong Zhang       y = yarray + 6*ridx[i];
200726e093fcSHong Zhang     }
200815091d37SBarry Smith     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2009444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2010444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
201115091d37SBarry Smith     for (j=0; j<n; j++) {
20123b95cb0eSSatish Balay       xb    = x + 6*(*idx++);
201315091d37SBarry Smith       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
201415091d37SBarry Smith       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
201515091d37SBarry Smith       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
201615091d37SBarry Smith       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
201715091d37SBarry Smith       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
201815091d37SBarry Smith       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
201915091d37SBarry Smith       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
202015091d37SBarry Smith       v    += 36;
202115091d37SBarry Smith     }
202215091d37SBarry Smith     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
202326e093fcSHong Zhang     if (!usecprow) {
202415091d37SBarry Smith       z += 6; y += 6;
202515091d37SBarry Smith     }
202626e093fcSHong Zhang   }
20271ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
202826e093fcSHong Zhang   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
202915091d37SBarry Smith   if (zz != yy) {
203026e093fcSHong Zhang     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
203115091d37SBarry Smith   }
2032dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
203315091d37SBarry Smith   PetscFunctionReturn(0);
203415091d37SBarry Smith }
20352d61bbb3SSatish Balay 
20364a2ae208SSatish Balay #undef __FUNCT__
20374a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
2038dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
20392d61bbb3SSatish Balay {
20402d61bbb3SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2041dcf5cc72SBarry Smith   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
204226e093fcSHong Zhang   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
20433f1db9ecSBarry Smith   MatScalar      *v;
2044dfbe8321SBarry Smith   PetscErrorCode ierr;
20450298fd71SBarry Smith   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
2046ace3abfcSBarry Smith   PetscBool      usecprow=a->compressedrow.use;
20472d61bbb3SSatish Balay 
20482d61bbb3SSatish Balay   PetscFunctionBegin;
20491ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
205026e093fcSHong Zhang   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
20512e8a6d31SBarry Smith   if (zz != yy) {
205226e093fcSHong Zhang     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
20532e8a6d31SBarry Smith   } else {
205426e093fcSHong Zhang     zarray = yarray;
20552e8a6d31SBarry Smith   }
20562d61bbb3SSatish Balay 
20572d61bbb3SSatish Balay   idx = a->j;
20582d61bbb3SSatish Balay   v   = a->a;
205926e093fcSHong Zhang   if (usecprow) {
20604eb6d288SHong Zhang     if (zz != yy) {
20614eb6d288SHong Zhang       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
20624eb6d288SHong Zhang     }
206326e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
206426e093fcSHong Zhang     ii   = a->compressedrow.i;
20657b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
206626e093fcSHong Zhang   } else {
20672d61bbb3SSatish Balay     ii = a->i;
206826e093fcSHong Zhang     y  = yarray;
206926e093fcSHong Zhang     z  = zarray;
207026e093fcSHong Zhang   }
20712d61bbb3SSatish Balay 
20722d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
20732d61bbb3SSatish Balay     n = ii[1] - ii[0]; ii++;
207426e093fcSHong Zhang     if (usecprow) {
20757b2bb3b9SHong Zhang       z = zarray + 7*ridx[i];
20767b2bb3b9SHong Zhang       y = yarray + 7*ridx[i];
207726e093fcSHong Zhang     }
20782d61bbb3SSatish Balay     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2079444d8c10SJed Brown     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2080444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
20812d61bbb3SSatish Balay     for (j=0; j<n; j++) {
20822d61bbb3SSatish Balay       xb    = x + 7*(*idx++);
20832d61bbb3SSatish Balay       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
20842d61bbb3SSatish Balay       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
20852d61bbb3SSatish Balay       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
20862d61bbb3SSatish Balay       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
20872d61bbb3SSatish Balay       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
20882d61bbb3SSatish Balay       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
20892d61bbb3SSatish Balay       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
20902d61bbb3SSatish Balay       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
20912d61bbb3SSatish Balay       v    += 49;
20922d61bbb3SSatish Balay     }
20932d61bbb3SSatish Balay     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
209426e093fcSHong Zhang     if (!usecprow) {
20952d61bbb3SSatish Balay       z += 7; y += 7;
20962d61bbb3SSatish Balay     }
209726e093fcSHong Zhang   }
20981ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
209926e093fcSHong Zhang   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
21002e8a6d31SBarry Smith   if (zz != yy) {
210126e093fcSHong Zhang     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
21022e8a6d31SBarry Smith   }
2103dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
21042d61bbb3SSatish Balay   PetscFunctionReturn(0);
21052d61bbb3SSatish Balay }
2106218c64b6SSatish Balay 
21074a2ae208SSatish Balay #undef __FUNCT__
21084a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
2109dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
21102d61bbb3SSatish Balay {
21112d61bbb3SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2112bba805e6SBarry Smith   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
21133f1db9ecSBarry Smith   MatScalar      *v;
21146849ba73SBarry Smith   PetscErrorCode ierr;
2115d0f46423SBarry Smith   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
21160298fd71SBarry Smith   PetscInt       ncols,k,*ridx=NULL;
2117ace3abfcSBarry Smith   PetscBool      usecprow=a->compressedrow.use;
2118218c64b6SSatish Balay 
21192d61bbb3SSatish Balay   PetscFunctionBegin;
2120343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
21211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
212226e093fcSHong Zhang   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
21232d61bbb3SSatish Balay 
21242d61bbb3SSatish Balay   idx = a->j;
21252d61bbb3SSatish Balay   v   = a->a;
212626e093fcSHong Zhang   if (usecprow) {
212726e093fcSHong Zhang     mbs  = a->compressedrow.nrows;
212826e093fcSHong Zhang     ii   = a->compressedrow.i;
21297b2bb3b9SHong Zhang     ridx = a->compressedrow.rindex;
213026e093fcSHong Zhang   } else {
213126e093fcSHong Zhang     mbs = a->mbs;
21322d61bbb3SSatish Balay     ii  = a->i;
213326e093fcSHong Zhang     z   = zarray;
213426e093fcSHong Zhang   }
21352d61bbb3SSatish Balay 
21362d61bbb3SSatish Balay   if (!a->mult_work) {
2137d0f46423SBarry Smith     k    = PetscMax(A->rmap->n,A->cmap->n);
213887828ca2SBarry Smith     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
21392d61bbb3SSatish Balay   }
21402d61bbb3SSatish Balay   work = a->mult_work;
21412d61bbb3SSatish Balay   for (i=0; i<mbs; i++) {
21422d61bbb3SSatish Balay     n     = ii[1] - ii[0]; ii++;
21432d61bbb3SSatish Balay     ncols = n*bs;
21442d61bbb3SSatish Balay     workt = work;
21452d61bbb3SSatish Balay     for (j=0; j<n; j++) {
21462d61bbb3SSatish Balay       xb = x + bs*(*idx++);
21472d61bbb3SSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
21482d61bbb3SSatish Balay       workt += bs;
21492d61bbb3SSatish Balay     }
21507b2bb3b9SHong Zhang     if (usecprow) z = zarray + bs*ridx[i];
215196b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
215271044d3cSBarry Smith     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
21532d61bbb3SSatish Balay     v += n*bs2;
215426fbe8dcSKarl Rupp     if (!usecprow) z += bs;
215526e093fcSHong Zhang   }
21561ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
215726e093fcSHong Zhang   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2158dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
21592d61bbb3SSatish Balay   PetscFunctionReturn(0);
21602d61bbb3SSatish Balay }
21612d61bbb3SSatish Balay 
21624a2ae208SSatish Balay #undef __FUNCT__
2163547795f9SHong Zhang #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
2164547795f9SHong Zhang PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2165547795f9SHong Zhang {
2166547795f9SHong Zhang   PetscScalar    zero = 0.0;
2167547795f9SHong Zhang   PetscErrorCode ierr;
2168547795f9SHong Zhang 
2169547795f9SHong Zhang   PetscFunctionBegin;
2170547795f9SHong Zhang   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2171547795f9SHong Zhang   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2172547795f9SHong Zhang   PetscFunctionReturn(0);
2173547795f9SHong Zhang }
2174547795f9SHong Zhang 
2175547795f9SHong Zhang #undef __FUNCT__
21764a2ae208SSatish Balay #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
2177dfbe8321SBarry Smith PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
21782d61bbb3SSatish Balay {
21793447b6efSHong Zhang   PetscScalar    zero = 0.0;
21806849ba73SBarry Smith   PetscErrorCode ierr;
21812d61bbb3SSatish Balay 
21822d61bbb3SSatish Balay   PetscFunctionBegin;
21832dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
21843447b6efSHong Zhang   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
21852d61bbb3SSatish Balay   PetscFunctionReturn(0);
21862d61bbb3SSatish Balay }
21872d61bbb3SSatish Balay 
21884a2ae208SSatish Balay #undef __FUNCT__
2189547795f9SHong Zhang #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
2190547795f9SHong Zhang PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2191547795f9SHong Zhang {
2192547795f9SHong Zhang   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2193547795f9SHong Zhang   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
2194547795f9SHong Zhang   MatScalar         *v;
2195547795f9SHong Zhang   PetscErrorCode    ierr;
21960298fd71SBarry Smith   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
2197547795f9SHong Zhang   Mat_CompressedRow cprow   = a->compressedrow;
2198ace3abfcSBarry Smith   PetscBool         usecprow=cprow.use;
2199547795f9SHong Zhang 
2200547795f9SHong Zhang   PetscFunctionBegin;
2201343551c4SBarry Smith   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2202547795f9SHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2203547795f9SHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2204547795f9SHong Zhang 
2205547795f9SHong Zhang   idx = a->j;
2206547795f9SHong Zhang   v   = a->a;
2207547795f9SHong Zhang   if (usecprow) {
2208547795f9SHong Zhang     mbs  = cprow.nrows;
2209547795f9SHong Zhang     ii   = cprow.i;
2210547795f9SHong Zhang     ridx = cprow.rindex;
2211547795f9SHong Zhang   } else {
2212547795f9SHong Zhang     mbs=a->mbs;
2213547795f9SHong Zhang     ii = a->i;
2214547795f9SHong Zhang     xb = x;
2215547795f9SHong Zhang   }
2216547795f9SHong Zhang 
2217547795f9SHong Zhang   switch (bs) {
2218547795f9SHong Zhang   case 1:
2219547795f9SHong Zhang     for (i=0; i<mbs; i++) {
2220547795f9SHong Zhang       if (usecprow) xb = x + ridx[i];
2221547795f9SHong Zhang       x1 = xb[0];
2222547795f9SHong Zhang       ib = idx + ii[0];
2223547795f9SHong Zhang       n  = ii[1] - ii[0]; ii++;
2224547795f9SHong Zhang       for (j=0; j<n; j++) {
2225547795f9SHong Zhang         rval     = ib[j];
2226547795f9SHong Zhang         z[rval] += PetscConj(*v) * x1;
2227547795f9SHong Zhang         v++;
2228547795f9SHong Zhang       }
2229547795f9SHong Zhang       if (!usecprow) xb++;
2230547795f9SHong Zhang     }
2231547795f9SHong Zhang     break;
2232547795f9SHong Zhang   case 2:
2233547795f9SHong Zhang     for (i=0; i<mbs; i++) {
2234547795f9SHong Zhang       if (usecprow) xb = x + 2*ridx[i];
2235547795f9SHong Zhang       x1 = xb[0]; x2 = xb[1];
2236547795f9SHong Zhang       ib = idx + ii[0];
2237547795f9SHong Zhang       n  = ii[1] - ii[0]; ii++;
2238547795f9SHong Zhang       for (j=0; j<n; j++) {
2239547795f9SHong Zhang         rval       = ib[j]*2;
2240547795f9SHong Zhang         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2241547795f9SHong Zhang         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2242547795f9SHong Zhang         v         += 4;
2243547795f9SHong Zhang       }
2244547795f9SHong Zhang       if (!usecprow) xb += 2;
2245547795f9SHong Zhang     }
2246547795f9SHong Zhang     break;
2247547795f9SHong Zhang   case 3:
2248547795f9SHong Zhang     for (i=0; i<mbs; i++) {
2249547795f9SHong Zhang       if (usecprow) xb = x + 3*ridx[i];
2250547795f9SHong Zhang       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2251547795f9SHong Zhang       ib = idx + ii[0];
2252547795f9SHong Zhang       n  = ii[1] - ii[0]; ii++;
2253547795f9SHong Zhang       for (j=0; j<n; j++) {
2254547795f9SHong Zhang         rval       = ib[j]*3;
2255547795f9SHong Zhang         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2256547795f9SHong Zhang         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2257547795f9SHong Zhang         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2258547795f9SHong Zhang         v         += 9;
2259547795f9SHong Zhang       }
2260547795f9SHong Zhang       if (!usecprow) xb += 3;
2261547795f9SHong Zhang     }
2262547795f9SHong Zhang     break;
2263547795f9SHong Zhang   case 4:
2264547795f9SHong Zhang     for (i=0; i<mbs; i++) {
2265547795f9SHong Zhang       if (usecprow) xb = x + 4*ridx[i];
2266547795f9SHong Zhang       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2267547795f9SHong Zhang       ib = idx + ii[0];
2268547795f9SHong Zhang       n  = ii[1] - ii[0]; ii++;
2269547795f9SHong Zhang       for (j=0; j<n; j++) {
2270547795f9SHong Zhang         rval       = ib[j]*4;
2271547795f9SHong Zhang         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2272547795f9SHong Zhang         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2273547795f9SHong Zhang         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2274547795f9SHong Zhang         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2275547795f9SHong Zhang         v         += 16;
2276547795f9SHong Zhang       }
2277547795f9SHong Zhang       if (!usecprow) xb += 4;
2278547795f9SHong Zhang     }
2279547795f9SHong Zhang     break;
2280547795f9SHong Zhang   case 5:
2281547795f9SHong Zhang     for (i=0; i<mbs; i++) {
2282547795f9SHong Zhang       if (usecprow) xb = x + 5*ridx[i];
2283547795f9SHong Zhang       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2284547795f9SHong Zhang       x4 = xb[3]; x5 = xb[4];
2285547795f9SHong Zhang       ib = idx + ii[0];
2286547795f9SHong Zhang       n  = ii[1] - ii[0]; ii++;
2287547795f9SHong Zhang       for (j=0; j<n; j++) {
2288547795f9SHong Zhang         rval       = ib[j]*5;
2289547795f9SHong Zhang         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2290547795f9SHong Zhang         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2291547795f9SHong Zhang         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2292547795f9SHong Zhang         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2293547795f9SHong Zhang         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2294547795f9SHong Zhang         v         += 25;
2295547795f9SHong Zhang       }
2296547795f9SHong Zhang       if (!usecprow) xb += 5;
2297547795f9SHong Zhang     }
2298547795f9SHong Zhang     break;
2299547795f9SHong Zhang   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
2300547795f9SHong Zhang     PetscInt    ncols,k;
2301547795f9SHong Zhang     PetscScalar *work,*workt,*xtmp;
2302547795f9SHong Zhang 
2303e32f2f54SBarry Smith     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2304547795f9SHong Zhang     if (!a->mult_work) {
2305547795f9SHong Zhang       k    = PetscMax(A->rmap->n,A->cmap->n);
2306547795f9SHong Zhang       ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
2307547795f9SHong Zhang     }
2308547795f9SHong Zhang     work = a->mult_work;
2309547795f9SHong Zhang     xtmp = x;
2310547795f9SHong Zhang     for (i=0; i<mbs; i++) {
2311547795f9SHong Zhang       n     = ii[1] - ii[0]; ii++;
2312547795f9SHong Zhang       ncols = n*bs;
2313547795f9SHong Zhang       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
231426fbe8dcSKarl Rupp       if (usecprow) xtmp = x + bs*ridx[i];
231596b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2316547795f9SHong Zhang       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2317547795f9SHong Zhang       v += n*bs2;
2318547795f9SHong Zhang       if (!usecprow) xtmp += bs;
2319547795f9SHong Zhang       workt = work;
2320547795f9SHong Zhang       for (j=0; j<n; j++) {
2321547795f9SHong Zhang         zb = z + bs*(*idx++);
2322547795f9SHong Zhang         for (k=0; k<bs; k++) zb[k] += workt[k] ;
2323547795f9SHong Zhang         workt += bs;
2324547795f9SHong Zhang       }
2325547795f9SHong Zhang     }
2326547795f9SHong Zhang     }
2327547795f9SHong Zhang   }
2328547795f9SHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2329547795f9SHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2330547795f9SHong Zhang   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2331547795f9SHong Zhang   PetscFunctionReturn(0);
2332547795f9SHong Zhang }
2333547795f9SHong Zhang 
2334547795f9SHong Zhang #undef __FUNCT__
23354a2ae208SSatish Balay #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
2336dfbe8321SBarry Smith PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
23372d61bbb3SSatish Balay {
23382d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2339dcf5cc72SBarry Smith   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
23403f1db9ecSBarry Smith   MatScalar         *v;
23416849ba73SBarry Smith   PetscErrorCode    ierr;
23420298fd71SBarry Smith   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
23433447b6efSHong Zhang   Mat_CompressedRow cprow   = a->compressedrow;
2344ace3abfcSBarry Smith   PetscBool         usecprow=cprow.use;
23452d61bbb3SSatish Balay 
23462d61bbb3SSatish Balay   PetscFunctionBegin;
2347343551c4SBarry Smith   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
23483447b6efSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23493447b6efSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
23502d61bbb3SSatish Balay 
23512d61bbb3SSatish Balay   idx = a->j;
23522d61bbb3SSatish Balay   v   = a->a;
23533447b6efSHong Zhang   if (usecprow) {
23543447b6efSHong Zhang     mbs  = cprow.nrows;
23553447b6efSHong Zhang     ii   = cprow.i;
23567b2bb3b9SHong Zhang     ridx = cprow.rindex;
23573447b6efSHong Zhang   } else {
23583447b6efSHong Zhang     mbs=a->mbs;
23592d61bbb3SSatish Balay     ii = a->i;
2360f1af5d2fSBarry Smith     xb = x;
23613447b6efSHong Zhang   }
23622d61bbb3SSatish Balay 
23632d61bbb3SSatish Balay   switch (bs) {
23642d61bbb3SSatish Balay   case 1:
23652d61bbb3SSatish Balay     for (i=0; i<mbs; i++) {
23667b2bb3b9SHong Zhang       if (usecprow) xb = x + ridx[i];
2367f1af5d2fSBarry Smith       x1 = xb[0];
23683447b6efSHong Zhang       ib = idx + ii[0];
23693447b6efSHong Zhang       n  = ii[1] - ii[0]; ii++;
23702d61bbb3SSatish Balay       for (j=0; j<n; j++) {
23712d61bbb3SSatish Balay         rval     = ib[j];
2372f1af5d2fSBarry Smith         z[rval] += *v * x1;
2373f1af5d2fSBarry Smith         v++;
23742d61bbb3SSatish Balay       }
23753447b6efSHong Zhang       if (!usecprow) xb++;
23762d61bbb3SSatish Balay     }
23772d61bbb3SSatish Balay     break;
23782d61bbb3SSatish Balay   case 2:
23792d61bbb3SSatish Balay     for (i=0; i<mbs; i++) {
23807b2bb3b9SHong Zhang       if (usecprow) xb = x + 2*ridx[i];
2381f1af5d2fSBarry Smith       x1 = xb[0]; x2 = xb[1];
23823447b6efSHong Zhang       ib = idx + ii[0];
23833447b6efSHong Zhang       n  = ii[1] - ii[0]; ii++;
23842d61bbb3SSatish Balay       for (j=0; j<n; j++) {
23852d61bbb3SSatish Balay         rval       = ib[j]*2;
23862d61bbb3SSatish Balay         z[rval++] += v[0]*x1 + v[1]*x2;
23872d61bbb3SSatish Balay         z[rval++] += v[2]*x1 + v[3]*x2;
23882d61bbb3SSatish Balay         v         += 4;
23892d61bbb3SSatish Balay       }
23903447b6efSHong Zhang       if (!usecprow) xb += 2;
23912d61bbb3SSatish Balay     }
23922d61bbb3SSatish Balay     break;
23932d61bbb3SSatish Balay   case 3:
23942d61bbb3SSatish Balay     for (i=0; i<mbs; i++) {
23957b2bb3b9SHong Zhang       if (usecprow) xb = x + 3*ridx[i];
2396f1af5d2fSBarry Smith       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
23973447b6efSHong Zhang       ib = idx + ii[0];
23983447b6efSHong Zhang       n  = ii[1] - ii[0]; ii++;
23992d61bbb3SSatish Balay       for (j=0; j<n; j++) {
24002d61bbb3SSatish Balay         rval       = ib[j]*3;
24012d61bbb3SSatish Balay         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
24022d61bbb3SSatish Balay         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
24032d61bbb3SSatish Balay         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
24042d61bbb3SSatish Balay         v         += 9;
24052d61bbb3SSatish Balay       }
24063447b6efSHong Zhang       if (!usecprow) xb += 3;
24072d61bbb3SSatish Balay     }
24082d61bbb3SSatish Balay     break;
24092d61bbb3SSatish Balay   case 4:
24102d61bbb3SSatish Balay     for (i=0; i<mbs; i++) {
24117b2bb3b9SHong Zhang       if (usecprow) xb = x + 4*ridx[i];
2412f1af5d2fSBarry Smith       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
24133447b6efSHong Zhang       ib = idx + ii[0];
24143447b6efSHong Zhang       n  = ii[1] - ii[0]; ii++;
24152d61bbb3SSatish Balay       for (j=0; j<n; j++) {
24162d61bbb3SSatish Balay         rval       = ib[j]*4;
24172d61bbb3SSatish Balay         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
24182d61bbb3SSatish Balay         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
24192d61bbb3SSatish Balay         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
24202d61bbb3SSatish Balay         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
24212d61bbb3SSatish Balay         v         += 16;
24222d61bbb3SSatish Balay       }
24233447b6efSHong Zhang       if (!usecprow) xb += 4;
24242d61bbb3SSatish Balay     }
24252d61bbb3SSatish Balay     break;
24262d61bbb3SSatish Balay   case 5:
24272d61bbb3SSatish Balay     for (i=0; i<mbs; i++) {
24287b2bb3b9SHong Zhang       if (usecprow) xb = x + 5*ridx[i];
2429f1af5d2fSBarry Smith       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
24302d61bbb3SSatish Balay       x4 = xb[3]; x5 = xb[4];
24313447b6efSHong Zhang       ib = idx + ii[0];
24323447b6efSHong Zhang       n  = ii[1] - ii[0]; ii++;
24332d61bbb3SSatish Balay       for (j=0; j<n; j++) {
24342d61bbb3SSatish Balay         rval       = ib[j]*5;
24352d61bbb3SSatish Balay         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
24362d61bbb3SSatish Balay         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
24372d61bbb3SSatish Balay         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
24382d61bbb3SSatish Balay         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
24392d61bbb3SSatish Balay         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
24402d61bbb3SSatish Balay         v         += 25;
24412d61bbb3SSatish Balay       }
24423447b6efSHong Zhang       if (!usecprow) xb += 5;
24432d61bbb3SSatish Balay     }
24442d61bbb3SSatish Balay     break;
2445f1af5d2fSBarry Smith   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2446690b6cddSBarry Smith     PetscInt    ncols,k;
24473447b6efSHong Zhang     PetscScalar *work,*workt,*xtmp;
24483f1db9ecSBarry Smith 
24492d61bbb3SSatish Balay     if (!a->mult_work) {
2450d0f46423SBarry Smith       k    = PetscMax(A->rmap->n,A->cmap->n);
245187828ca2SBarry Smith       ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
24522d61bbb3SSatish Balay     }
24532d61bbb3SSatish Balay     work = a->mult_work;
24543447b6efSHong Zhang     xtmp = x;
24552d61bbb3SSatish Balay     for (i=0; i<mbs; i++) {
24562d61bbb3SSatish Balay       n     = ii[1] - ii[0]; ii++;
24572d61bbb3SSatish Balay       ncols = n*bs;
245887828ca2SBarry Smith       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
245926fbe8dcSKarl Rupp       if (usecprow) xtmp = x + bs*ridx[i];
246096b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
246171044d3cSBarry Smith       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
24622d61bbb3SSatish Balay       v += n*bs2;
24633447b6efSHong Zhang       if (!usecprow) xtmp += bs;
24642d61bbb3SSatish Balay       workt = work;
24652d61bbb3SSatish Balay       for (j=0; j<n; j++) {
24662d61bbb3SSatish Balay         zb = z + bs*(*idx++);
24672d61bbb3SSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
24682d61bbb3SSatish Balay         workt += bs;
24692d61bbb3SSatish Balay       }
24702d61bbb3SSatish Balay     }
24712d61bbb3SSatish Balay     }
24722d61bbb3SSatish Balay   }
24733447b6efSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
24743447b6efSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2475dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
24762d61bbb3SSatish Balay   PetscFunctionReturn(0);
24772d61bbb3SSatish Balay }
24782d61bbb3SSatish Balay 
24794a2ae208SSatish Balay #undef __FUNCT__
24804a2ae208SSatish Balay #define __FUNCT__ "MatScale_SeqBAIJ"
2481f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
24822d61bbb3SSatish Balay {
24832d61bbb3SSatish Balay   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2484690b6cddSBarry Smith   PetscInt       totalnz = a->bs2*a->nz;
2485f4df32b1SMatthew Knepley   PetscScalar    oalpha  = alpha;
2486efee365bSSatish Balay   PetscErrorCode ierr;
2487c5df96a5SBarry Smith   PetscBLASInt   one = 1,tnz;
24882d61bbb3SSatish Balay 
24892d61bbb3SSatish Balay   PetscFunctionBegin;
2490c5df96a5SBarry Smith   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
24918b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2492efee365bSSatish Balay   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
24932d61bbb3SSatish Balay   PetscFunctionReturn(0);
24942d61bbb3SSatish Balay }
24952d61bbb3SSatish Balay 
24964a2ae208SSatish Balay #undef __FUNCT__
24974a2ae208SSatish Balay #define __FUNCT__ "MatNorm_SeqBAIJ"
2498dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
24992d61bbb3SSatish Balay {
25008a62d963SHong Zhang   PetscErrorCode ierr;
25012d61bbb3SSatish Balay   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
25023f1db9ecSBarry Smith   MatScalar      *v  = a->a;
2503329f5518SBarry Smith   PetscReal      sum = 0.0;
2504d0f46423SBarry Smith   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
25052d61bbb3SSatish Balay 
25062d61bbb3SSatish Balay   PetscFunctionBegin;
25072d61bbb3SSatish Balay   if (type == NORM_FROBENIUS) {
25082d61bbb3SSatish Balay     for (i=0; i< bs2*nz; i++) {
2509329f5518SBarry Smith       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
25102d61bbb3SSatish Balay     }
25118f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum);
25128a62d963SHong Zhang   } else if (type == NORM_1) { /* maximum column sum */
25138a62d963SHong Zhang     PetscReal *tmp;
25148a62d963SHong Zhang     PetscInt  *bcol = a->j;
2515d0f46423SBarry Smith     ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
2516d0f46423SBarry Smith     ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr);
25178a62d963SHong Zhang     for (i=0; i<nz; i++) {
25188a62d963SHong Zhang       for (j=0; j<bs; j++) {
25198a62d963SHong Zhang         k1 = bs*(*bcol) + j; /* column index */
25208a62d963SHong Zhang         for (k=0; k<bs; k++) {
25218a62d963SHong Zhang           tmp[k1] += PetscAbsScalar(*v); v++;
25228a62d963SHong Zhang         }
25238a62d963SHong Zhang       }
25248a62d963SHong Zhang       bcol++;
25258a62d963SHong Zhang     }
25268a62d963SHong Zhang     *norm = 0.0;
2527d0f46423SBarry Smith     for (j=0; j<A->cmap->n; j++) {
25288a62d963SHong Zhang       if (tmp[j] > *norm) *norm = tmp[j];
25298a62d963SHong Zhang     }
25308a62d963SHong Zhang     ierr = PetscFree(tmp);CHKERRQ(ierr);
2531596552b5SBarry Smith   } else if (type == NORM_INFINITY) { /* maximum row sum */
2532596552b5SBarry Smith     *norm = 0.0;
2533596552b5SBarry Smith     for (k=0; k<bs; k++) {
253474f84c7bSSatish Balay       for (j=0; j<a->mbs; j++) {
2535596552b5SBarry Smith         v   = a->a + bs2*a->i[j] + k;
2536596552b5SBarry Smith         sum = 0.0;
2537596552b5SBarry Smith         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
25380e90e235SBarry Smith           for (k1=0; k1<bs; k1++) {
2539596552b5SBarry Smith             sum += PetscAbsScalar(*v);
2540596552b5SBarry Smith             v   += bs;
25412d61bbb3SSatish Balay           }
25420e90e235SBarry Smith         }
2543596552b5SBarry Smith         if (sum > *norm) *norm = sum;
2544596552b5SBarry Smith       }
2545596552b5SBarry Smith     }
2546e7e72b3dSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
25472d61bbb3SSatish Balay   PetscFunctionReturn(0);
25482d61bbb3SSatish Balay }
25492d61bbb3SSatish Balay 
25502d61bbb3SSatish Balay 
25514a2ae208SSatish Balay #undef __FUNCT__
25524a2ae208SSatish Balay #define __FUNCT__ "MatEqual_SeqBAIJ"
2553ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
25542d61bbb3SSatish Balay {
25552d61bbb3SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2556dfbe8321SBarry Smith   PetscErrorCode ierr;
25572d61bbb3SSatish Balay 
25582d61bbb3SSatish Balay   PetscFunctionBegin;
25592d61bbb3SSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2560d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2561273d9f13SBarry Smith     *flg = PETSC_FALSE;
2562273d9f13SBarry Smith     PetscFunctionReturn(0);
25632d61bbb3SSatish Balay   }
25642d61bbb3SSatish Balay 
25652d61bbb3SSatish Balay   /* if the a->i are the same */
2566690b6cddSBarry Smith   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
256726fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
25682d61bbb3SSatish Balay 
25692d61bbb3SSatish Balay   /* if a->j are the same */
2570690b6cddSBarry Smith   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
257126fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
257226fbe8dcSKarl Rupp 
25732d61bbb3SSatish Balay   /* if a->a are the same */
2574d0f46423SBarry Smith   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
25752d61bbb3SSatish Balay   PetscFunctionReturn(0);
25762d61bbb3SSatish Balay 
25772d61bbb3SSatish Balay }
25782d61bbb3SSatish Balay 
25794a2ae208SSatish Balay #undef __FUNCT__
25804a2ae208SSatish Balay #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
2581dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
25822d61bbb3SSatish Balay {
25832d61bbb3SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2584dfbe8321SBarry Smith   PetscErrorCode ierr;
2585690b6cddSBarry Smith   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
258687828ca2SBarry Smith   PetscScalar    *x,zero = 0.0;
25873f1db9ecSBarry Smith   MatScalar      *aa,*aa_j;
25882d61bbb3SSatish Balay 
25892d61bbb3SSatish Balay   PetscFunctionBegin;
2590e32f2f54SBarry Smith   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2591d0f46423SBarry Smith   bs   = A->rmap->bs;
25922d61bbb3SSatish Balay   aa   = a->a;
25932d61bbb3SSatish Balay   ai   = a->i;
25942d61bbb3SSatish Balay   aj   = a->j;
25952d61bbb3SSatish Balay   ambs = a->mbs;
25962d61bbb3SSatish Balay   bs2  = a->bs2;
25972d61bbb3SSatish Balay 
25982dcb1b2aSMatthew Knepley   ierr = VecSet(v,zero);CHKERRQ(ierr);
25991ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2600e1311b90SBarry Smith   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2601e32f2f54SBarry Smith   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
26022d61bbb3SSatish Balay   for (i=0; i<ambs; i++) {
26032d61bbb3SSatish Balay     for (j=ai[i]; j<ai[i+1]; j++) {
26042d61bbb3SSatish Balay       if (aj[j] == i) {
26052d61bbb3SSatish Balay         row  = i*bs;
26062d61bbb3SSatish Balay         aa_j = aa+j*bs2;
26072d61bbb3SSatish Balay         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
26082d61bbb3SSatish Balay         break;
26092d61bbb3SSatish Balay       }
26102d61bbb3SSatish Balay     }
26112d61bbb3SSatish Balay   }
26121ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
26132d61bbb3SSatish Balay   PetscFunctionReturn(0);
26142d61bbb3SSatish Balay }
26152d61bbb3SSatish Balay 
26164a2ae208SSatish Balay #undef __FUNCT__
26174a2ae208SSatish Balay #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2618dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
26192d61bbb3SSatish Balay {
26202d61bbb3SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
262153ef36baSBarry Smith   const PetscScalar *l,*r,*li,*ri;
262253ef36baSBarry Smith   PetscScalar       x;
26233f1db9ecSBarry Smith   MatScalar         *aa, *v;
2624dfbe8321SBarry Smith   PetscErrorCode    ierr;
262553ef36baSBarry Smith   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
262653ef36baSBarry Smith   const PetscInt    *ai,*aj;
26272d61bbb3SSatish Balay 
26282d61bbb3SSatish Balay   PetscFunctionBegin;
26292d61bbb3SSatish Balay   ai  = a->i;
26302d61bbb3SSatish Balay   aj  = a->j;
26312d61bbb3SSatish Balay   aa  = a->a;
2632d0f46423SBarry Smith   m   = A->rmap->n;
2633d0f46423SBarry Smith   n   = A->cmap->n;
2634d0f46423SBarry Smith   bs  = A->rmap->bs;
26352d61bbb3SSatish Balay   mbs = a->mbs;
26362d61bbb3SSatish Balay   bs2 = a->bs2;
26372d61bbb3SSatish Balay   if (ll) {
263853ef36baSBarry Smith     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2639e1311b90SBarry Smith     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2640e32f2f54SBarry Smith     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
26412d61bbb3SSatish Balay     for (i=0; i<mbs; i++) { /* for each block row */
26422d61bbb3SSatish Balay       M  = ai[i+1] - ai[i];
26432d61bbb3SSatish Balay       li = l + i*bs;
26442d61bbb3SSatish Balay       v  = aa + bs2*ai[i];
26452d61bbb3SSatish Balay       for (j=0; j<M; j++) { /* for each block */
26462d61bbb3SSatish Balay         for (k=0; k<bs2; k++) {
26472d61bbb3SSatish Balay           (*v++) *= li[k%bs];
26482d61bbb3SSatish Balay         }
26492d61bbb3SSatish Balay       }
26502d61bbb3SSatish Balay     }
265153ef36baSBarry Smith     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2652efee365bSSatish Balay     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
26532d61bbb3SSatish Balay   }
26542d61bbb3SSatish Balay 
26552d61bbb3SSatish Balay   if (rr) {
265653ef36baSBarry Smith     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2657e1311b90SBarry Smith     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2658e32f2f54SBarry Smith     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
26592d61bbb3SSatish Balay     for (i=0; i<mbs; i++) { /* for each block row */
266053ef36baSBarry Smith       iai = ai[i];
266153ef36baSBarry Smith       M   = ai[i+1] - iai;
266253ef36baSBarry Smith       v   = aa + bs2*iai;
26632d61bbb3SSatish Balay       for (j=0; j<M; j++) { /* for each block */
266453ef36baSBarry Smith         ri = r + bs*aj[iai+j];
26652d61bbb3SSatish Balay         for (k=0; k<bs; k++) {
26662d61bbb3SSatish Balay           x = ri[k];
266753ef36baSBarry Smith           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
266853ef36baSBarry Smith           v += bs;
26692d61bbb3SSatish Balay         }
26702d61bbb3SSatish Balay       }
26712d61bbb3SSatish Balay     }
267253ef36baSBarry Smith     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2673efee365bSSatish Balay     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
26742d61bbb3SSatish Balay   }
26752d61bbb3SSatish Balay   PetscFunctionReturn(0);
26762d61bbb3SSatish Balay }
26772d61bbb3SSatish Balay 
26782d61bbb3SSatish Balay 
26794a2ae208SSatish Balay #undef __FUNCT__
26804a2ae208SSatish Balay #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2681dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
26822d61bbb3SSatish Balay {
26832d61bbb3SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
26842d61bbb3SSatish Balay 
26852d61bbb3SSatish Balay   PetscFunctionBegin;
26862d61bbb3SSatish Balay   info->block_size   = a->bs2;
2687ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;
26882d61bbb3SSatish Balay   info->nz_used      = a->bs2*a->nz;
26892d61bbb3SSatish Balay   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
26902d61bbb3SSatish Balay   info->assemblies   = A->num_ass;
26918e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
26927adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
2693d5f3da31SBarry Smith   if (A->factortype) {
26942d61bbb3SSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
26952d61bbb3SSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
26962d61bbb3SSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
26972d61bbb3SSatish Balay   } else {
26982d61bbb3SSatish Balay     info->fill_ratio_given  = 0;
26992d61bbb3SSatish Balay     info->fill_ratio_needed = 0;
27002d61bbb3SSatish Balay     info->factor_mallocs    = 0;
27012d61bbb3SSatish Balay   }
27022d61bbb3SSatish Balay   PetscFunctionReturn(0);
27032d61bbb3SSatish Balay }
27042d61bbb3SSatish Balay 
27052d61bbb3SSatish Balay 
27069bfb1392SMichael Lange #if defined(PETSC_THREADCOMM_ACTIVE)
27079bfb1392SMichael Lange PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A)
27089bfb1392SMichael Lange {
27099bfb1392SMichael Lange   PetscErrorCode ierr;
27109bfb1392SMichael Lange   PetscInt       *trstarts=A->rmap->trstarts;
27119bfb1392SMichael Lange   PetscInt       n,start,end;
27129bfb1392SMichael Lange   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
27139bfb1392SMichael Lange 
27149bfb1392SMichael Lange   start = trstarts[thread_id];
27159bfb1392SMichael Lange   end   = trstarts[thread_id+1];
27169bfb1392SMichael Lange   n     = a->i[end] - a->i[start];
27179bfb1392SMichael Lange   ierr  = PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(PetscScalar));CHKERRQ(ierr);
27189bfb1392SMichael Lange   return 0;
27199bfb1392SMichael Lange }
27209bfb1392SMichael Lange 
27219bfb1392SMichael Lange #undef __FUNCT__
27229bfb1392SMichael Lange #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
27239bfb1392SMichael Lange PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
27249bfb1392SMichael Lange {
27259bfb1392SMichael Lange   PetscErrorCode ierr;
27269bfb1392SMichael Lange 
27279bfb1392SMichael Lange 
27289bfb1392SMichael Lange   PetscFunctionBegin;
27299bfb1392SMichael Lange   ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);CHKERRQ(ierr);
27309bfb1392SMichael Lange   PetscFunctionReturn(0);
27319bfb1392SMichael Lange }
27329bfb1392SMichael Lange #else
27334a2ae208SSatish Balay #undef __FUNCT__
27344a2ae208SSatish Balay #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2735dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
27362d61bbb3SSatish Balay {
27372d61bbb3SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2738dfbe8321SBarry Smith   PetscErrorCode ierr;
27392d61bbb3SSatish Balay 
27402d61bbb3SSatish Balay   PetscFunctionBegin;
2741549d3d68SSatish Balay   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
27422d61bbb3SSatish Balay   PetscFunctionReturn(0);
27432d61bbb3SSatish Balay }
27449bfb1392SMichael Lange #endif
2745