xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 6c4ed00291fe44f94936dd2f04c02ab3c442e77c)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
3af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h>
6c6db04a5SJed Brown #include <petscblaslapack.h>
749b5e25fSSatish Balay 
84a2ae208SSatish Balay #undef __FUNCT__
94a2ae208SSatish Balay #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
1013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
1149b5e25fSSatish Balay {
125eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
136849ba73SBarry Smith   PetscErrorCode ierr;
145d0c19d7SBarry Smith   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
155d0c19d7SBarry Smith   const PetscInt *idx;
16db41cccfSHong Zhang   PetscBT        table_out,table_in;
17d94109b8SHong Zhang 
18d94109b8SHong Zhang   PetscFunctionBegin;
19e32f2f54SBarry Smith   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20d94109b8SHong Zhang   mbs  = a->mbs;
21d94109b8SHong Zhang   ai   = a->i;
22d94109b8SHong Zhang   aj   = a->j;
23d0f46423SBarry Smith   bs   = A->rmap->bs;
2453b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25854ce69bSBarry Smith   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
26854ce69bSBarry Smith   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
2753b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28d94109b8SHong Zhang 
29d94109b8SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
30d94109b8SHong Zhang     isz  = 0;
31db41cccfSHong Zhang     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32d94109b8SHong Zhang 
33d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
34d94109b8SHong Zhang     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35d94109b8SHong Zhang     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36d94109b8SHong Zhang 
37db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38dbe03f88SHong Zhang     bcol_max = 0;
39d94109b8SHong Zhang     for (j=0; j<n; ++j) {
40d94109b8SHong Zhang       brow = idx[j]/bs; /* convert the indices into block indices */
41e32f2f54SBarry Smith       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out,brow)) {
43dbe03f88SHong Zhang         nidx[isz++] = brow;
44dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
45dbe03f88SHong Zhang       }
46d94109b8SHong Zhang     }
47d94109b8SHong Zhang     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
486bf464f9SBarry Smith     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49d94109b8SHong Zhang 
50d94109b8SHong Zhang     k = 0;
51d94109b8SHong Zhang     for (j=0; j<ov; j++) { /* for each overlap */
52db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53db41cccfSHong Zhang       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54db41cccfSHong Zhang       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55d94109b8SHong Zhang 
56d94109b8SHong Zhang       n = isz;  /* length of the updated is[i] */
57d94109b8SHong Zhang       for (brow=0; brow<mbs; brow++) {
58d94109b8SHong Zhang         start = ai[brow]; end   = ai[brow+1];
59db41cccfSHong Zhang         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60d94109b8SHong Zhang           for (l = start; l<end; l++) {
61d94109b8SHong Zhang             bcol = aj[l];
62d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out,bcol)) {
63d7b97159SHong Zhang               nidx[isz++] = bcol;
64d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
65d7b97159SHong Zhang             }
66d94109b8SHong Zhang           }
67d94109b8SHong Zhang           k++;
68d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69d94109b8SHong Zhang         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70d94109b8SHong Zhang           for (l = start; l<end; l++) {
71d94109b8SHong Zhang             bcol = aj[l];
72dbe03f88SHong Zhang             if (bcol > bcol_max) break;
73db41cccfSHong Zhang             if (PetscBTLookup(table_in,bcol)) {
7426fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
76d94109b8SHong Zhang             }
77d94109b8SHong Zhang           }
78d94109b8SHong Zhang         }
79d94109b8SHong Zhang       }
80d94109b8SHong Zhang     } /* for each overlap */
81d94109b8SHong Zhang 
82d94109b8SHong Zhang     /* expand the Index Set */
83d94109b8SHong Zhang     for (j=0; j<isz; j++) {
8426fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85d94109b8SHong Zhang     }
8670b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
87d94109b8SHong Zhang   }
8894bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
89d94109b8SHong Zhang   ierr = PetscFree(nidx);CHKERRQ(ierr);
90d94109b8SHong Zhang   ierr = PetscFree(nidx2);CHKERRQ(ierr);
9194bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
925eee224dSHong Zhang   PetscFunctionReturn(0);
9349b5e25fSSatish Balay }
9449b5e25fSSatish Balay 
954a2ae208SSatish Balay #undef __FUNCT__
964a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
9755d14e7dSHong Zhang PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,MatReuse scall,Mat *B)
9849b5e25fSSatish Balay {
9949b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data,*c;
1006849ba73SBarry Smith   PetscErrorCode  ierr;
10113f74950SBarry Smith   PetscInt        *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
10213f74950SBarry Smith   PetscInt        row,mat_i,*mat_j,tcol,*mat_ilen;
1035d0c19d7SBarry Smith   PetscInt        nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
1045d0c19d7SBarry Smith   const PetscInt  *irow,*aj = a->j,*ai = a->i;
10549b5e25fSSatish Balay   MatScalar       *mat_a;
10649b5e25fSSatish Balay   Mat             C;
107ace3abfcSBarry Smith   PetscBool       flag,sorted;
10849b5e25fSSatish Balay 
10949b5e25fSSatish Balay   PetscFunctionBegin;
11055d14e7dSHong Zhang   ierr = ISSorted(isrow,&sorted);CHKERRQ(ierr);
11155d14e7dSHong Zhang   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
11249b5e25fSSatish Balay 
11349b5e25fSSatish Balay   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
11449b5e25fSSatish Balay   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
11549b5e25fSSatish Balay 
116785e854fSJed Brown   ierr  = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr);
11774ed9c26SBarry Smith   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
11849b5e25fSSatish Balay   ssmap = smap;
119854ce69bSBarry Smith   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
12049b5e25fSSatish Balay   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
12149b5e25fSSatish Balay   /* determine lens of each row */
12249b5e25fSSatish Balay   for (i=0; i<nrows; i++) {
12349b5e25fSSatish Balay     kstart  = ai[irow[i]];
12449b5e25fSSatish Balay     kend    = kstart + a->ilen[irow[i]];
12549b5e25fSSatish Balay     lens[i] = 0;
12649b5e25fSSatish Balay     for (k=kstart; k<kend; k++) {
12726fbe8dcSKarl Rupp       if (ssmap[aj[k]]) lens[i]++;
12849b5e25fSSatish Balay     }
12949b5e25fSSatish Balay   }
13049b5e25fSSatish Balay   /* Create and fill new matrix */
13149b5e25fSSatish Balay   if (scall == MAT_REUSE_MATRIX) {
13249b5e25fSSatish Balay     c = (Mat_SeqSBAIJ*)((*B)->data);
13349b5e25fSSatish Balay 
134e32f2f54SBarry Smith     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
13513f74950SBarry Smith     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
136e7e72b3dSBarry Smith     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
13713f74950SBarry Smith     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
13849b5e25fSSatish Balay     C    = *B;
13949b5e25fSSatish Balay   } else {
140ce94432eSBarry Smith     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
141f69a0ea3SMatthew Knepley     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1427adad957SLisandro Dalcin     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
143ab93d7beSBarry Smith     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
14449b5e25fSSatish Balay   }
14549b5e25fSSatish Balay   c = (Mat_SeqSBAIJ*)(C->data);
14649b5e25fSSatish Balay   for (i=0; i<nrows; i++) {
14749b5e25fSSatish Balay     row      = irow[i];
14849b5e25fSSatish Balay     kstart   = ai[row];
14949b5e25fSSatish Balay     kend     = kstart + a->ilen[row];
15049b5e25fSSatish Balay     mat_i    = c->i[i];
15149b5e25fSSatish Balay     mat_j    = c->j + mat_i;
15249b5e25fSSatish Balay     mat_a    = c->a + mat_i*bs2;
15349b5e25fSSatish Balay     mat_ilen = c->ilen + i;
15449b5e25fSSatish Balay     for (k=kstart; k<kend; k++) {
15549b5e25fSSatish Balay       if ((tcol=ssmap[a->j[k]])) {
15649b5e25fSSatish Balay         *mat_j++ = tcol - 1;
15749b5e25fSSatish Balay         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
15849b5e25fSSatish Balay         mat_a   += bs2;
15949b5e25fSSatish Balay         (*mat_ilen)++;
16049b5e25fSSatish Balay       }
16149b5e25fSSatish Balay     }
16249b5e25fSSatish Balay   }
16349b5e25fSSatish Balay 
16449b5e25fSSatish Balay   /* Free work space */
16549b5e25fSSatish Balay   ierr = PetscFree(smap);CHKERRQ(ierr);
16649b5e25fSSatish Balay   ierr = PetscFree(lens);CHKERRQ(ierr);
16749b5e25fSSatish Balay   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
16849b5e25fSSatish Balay   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
16949b5e25fSSatish Balay 
17049b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
17149b5e25fSSatish Balay   *B   = C;
17249b5e25fSSatish Balay   PetscFunctionReturn(0);
17349b5e25fSSatish Balay }
17449b5e25fSSatish Balay 
1754a2ae208SSatish Balay #undef __FUNCT__
1764a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
1774aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
17849b5e25fSSatish Balay {
17949b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
18049b5e25fSSatish Balay   IS             is1;
1816849ba73SBarry Smith   PetscErrorCode ierr;
1825d0c19d7SBarry Smith   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
1835d0c19d7SBarry Smith   const PetscInt *irow;
18449b5e25fSSatish Balay 
18549b5e25fSSatish Balay   PetscFunctionBegin;
1868f46ffcaSHong Zhang   if (isrow != iscol) {
1878f46ffcaSHong Zhang     PetscBool isequal;
1888f46ffcaSHong Zhang     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
189*6c4ed002SBarry Smith     if (!isequal) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow");
1908f46ffcaSHong Zhang   }
19149b5e25fSSatish Balay 
19255d14e7dSHong Zhang   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
19349b5e25fSSatish Balay   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
19449b5e25fSSatish Balay 
19549b5e25fSSatish Balay   /* Verify if the indices corespond to each element in a block
19655d14e7dSHong Zhang    and form the IS with compressed IS */
197dcca6d9dSJed Brown   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
19874ed9c26SBarry Smith   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
19949b5e25fSSatish Balay   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
20049b5e25fSSatish Balay 
20149b5e25fSSatish Balay   count = 0;
20249b5e25fSSatish Balay   for (i=0; i<a->mbs; i++) {
203e32f2f54SBarry Smith     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
20449b5e25fSSatish Balay     if (vary[i]==bs) iary[count++] = i;
20549b5e25fSSatish Balay   }
20649b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
20770b3c8c7SBarry Smith   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
20874ed9c26SBarry Smith   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
20949b5e25fSSatish Balay 
21055d14e7dSHong Zhang   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,scall,B);CHKERRQ(ierr);
2116bf464f9SBarry Smith   ierr = ISDestroy(&is1);CHKERRQ(ierr);
21249b5e25fSSatish Balay   PetscFunctionReturn(0);
21349b5e25fSSatish Balay }
21449b5e25fSSatish Balay 
2154a2ae208SSatish Balay #undef __FUNCT__
2164a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
21713f74950SBarry Smith PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
21849b5e25fSSatish Balay {
2196849ba73SBarry Smith   PetscErrorCode ierr;
22013f74950SBarry Smith   PetscInt       i;
22149b5e25fSSatish Balay 
22249b5e25fSSatish Balay   PetscFunctionBegin;
22349b5e25fSSatish Balay   if (scall == MAT_INITIAL_MATRIX) {
224854ce69bSBarry Smith     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
22549b5e25fSSatish Balay   }
22649b5e25fSSatish Balay 
22749b5e25fSSatish Balay   for (i=0; i<n; i++) {
2284aa3045dSJed Brown     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
22949b5e25fSSatish Balay   }
23049b5e25fSSatish Balay   PetscFunctionReturn(0);
23149b5e25fSSatish Balay }
23249b5e25fSSatish Balay 
23349b5e25fSSatish Balay /* -------------------------------------------------------*/
23449b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
23549b5e25fSSatish Balay /* -------------------------------------------------------*/
23649b5e25fSSatish Balay 
2374a2ae208SSatish Balay #undef __FUNCT__
2384a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_2"
239dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
24049b5e25fSSatish Balay {
24149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
242d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,zero=0.0;
243d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
244d9ca1df4SBarry Smith   const MatScalar   *v;
2456849ba73SBarry Smith   PetscErrorCode    ierr;
246d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
247d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
24898c9bda7SSatish Balay   PetscInt          nonzerorow=0;
24949b5e25fSSatish Balay 
25049b5e25fSSatish Balay   PetscFunctionBegin;
2512dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
252d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2531ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
25449b5e25fSSatish Balay 
25549b5e25fSSatish Balay   v  = a->a;
25649b5e25fSSatish Balay   xb = x;
25749b5e25fSSatish Balay 
25849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
25949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
26049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
26149b5e25fSSatish Balay     ib          = aj + *ai;
262831a3094SHong Zhang     jmin        = 0;
26398c9bda7SSatish Balay     nonzerorow += (n>0);
2647fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
26549b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
26649b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
267831a3094SHong Zhang       v        += 4; jmin++;
2687fbae186SHong Zhang     }
269444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
270444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
271831a3094SHong Zhang     for (j=jmin; j<n; j++) {
27249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
27349b5e25fSSatish Balay       cval       = ib[j]*2;
27449b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
27549b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
27649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
27749b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
27849b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
27949b5e25fSSatish Balay       v        += 4;
28049b5e25fSSatish Balay     }
28149b5e25fSSatish Balay     xb +=2; ai++;
28249b5e25fSSatish Balay   }
28349b5e25fSSatish Balay 
284d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2851ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
286dc0b31edSSatish Balay   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
28749b5e25fSSatish Balay   PetscFunctionReturn(0);
28849b5e25fSSatish Balay }
28949b5e25fSSatish Balay 
2904a2ae208SSatish Balay #undef __FUNCT__
2914a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_3"
292dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
29349b5e25fSSatish Balay {
29449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
295d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,zero=0.0;
296d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
297d9ca1df4SBarry Smith   const MatScalar   *v;
2986849ba73SBarry Smith   PetscErrorCode    ierr;
299d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
300d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
30198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
30249b5e25fSSatish Balay 
30349b5e25fSSatish Balay   PetscFunctionBegin;
3042dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
305d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3061ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
30749b5e25fSSatish Balay 
30849b5e25fSSatish Balay   v  = a->a;
30949b5e25fSSatish Balay   xb = x;
31049b5e25fSSatish Balay 
31149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
31249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
31349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
31449b5e25fSSatish Balay     ib          = aj + *ai;
315831a3094SHong Zhang     jmin        = 0;
31698c9bda7SSatish Balay     nonzerorow += (n>0);
3177fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
31849b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
31949b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
32049b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
321831a3094SHong Zhang       v        += 9; jmin++;
3227fbae186SHong Zhang     }
323444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
324444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
325831a3094SHong Zhang     for (j=jmin; j<n; j++) {
32649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
32749b5e25fSSatish Balay       cval       = ib[j]*3;
32849b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
32949b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
33049b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
33149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
33249b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
33349b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
33449b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
33549b5e25fSSatish Balay       v        += 9;
33649b5e25fSSatish Balay     }
33749b5e25fSSatish Balay     xb +=3; ai++;
33849b5e25fSSatish Balay   }
33949b5e25fSSatish Balay 
340d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3411ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
342dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
34349b5e25fSSatish Balay   PetscFunctionReturn(0);
34449b5e25fSSatish Balay }
34549b5e25fSSatish Balay 
3464a2ae208SSatish Balay #undef __FUNCT__
3474a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_4"
348dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
34949b5e25fSSatish Balay {
35049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
351d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
352d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
353d9ca1df4SBarry Smith   const MatScalar   *v;
3546849ba73SBarry Smith   PetscErrorCode    ierr;
355d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
356d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
35798c9bda7SSatish Balay   PetscInt          nonzerorow = 0;
35849b5e25fSSatish Balay 
35949b5e25fSSatish Balay   PetscFunctionBegin;
3602dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
361d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3621ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
36349b5e25fSSatish Balay 
36449b5e25fSSatish Balay   v  = a->a;
36549b5e25fSSatish Balay   xb = x;
36649b5e25fSSatish Balay 
36749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
36849b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
36949b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
37049b5e25fSSatish Balay     ib          = aj + *ai;
371831a3094SHong Zhang     jmin        = 0;
37298c9bda7SSatish Balay     nonzerorow += (n>0);
3737fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
37449b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
37549b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
37649b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
37749b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
378831a3094SHong Zhang       v        += 16; jmin++;
3797fbae186SHong Zhang     }
380444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
381444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
382831a3094SHong Zhang     for (j=jmin; j<n; j++) {
38349b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
38449b5e25fSSatish Balay       cval       = ib[j]*4;
38549b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
38649b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
38749b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
38849b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
38949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
39049b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
39149b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
39249b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
39349b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
39449b5e25fSSatish Balay       v        += 16;
39549b5e25fSSatish Balay     }
39649b5e25fSSatish Balay     xb +=4; ai++;
39749b5e25fSSatish Balay   }
39849b5e25fSSatish Balay 
399d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4001ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
401dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
40249b5e25fSSatish Balay   PetscFunctionReturn(0);
40349b5e25fSSatish Balay }
40449b5e25fSSatish Balay 
4054a2ae208SSatish Balay #undef __FUNCT__
4064a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_5"
407dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
40849b5e25fSSatish Balay {
40949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
410d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
411d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
412d9ca1df4SBarry Smith   const MatScalar   *v;
4136849ba73SBarry Smith   PetscErrorCode    ierr;
414d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
415d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
41698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
41749b5e25fSSatish Balay 
41849b5e25fSSatish Balay   PetscFunctionBegin;
4192dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
420d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4211ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
42249b5e25fSSatish Balay 
42349b5e25fSSatish Balay   v  = a->a;
42449b5e25fSSatish Balay   xb = x;
42549b5e25fSSatish Balay 
42649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
42749b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
42849b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
42949b5e25fSSatish Balay     ib          = aj + *ai;
430831a3094SHong Zhang     jmin        = 0;
43198c9bda7SSatish Balay     nonzerorow += (n>0);
4327fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
43349b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
43449b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
43549b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
43649b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
43749b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
438831a3094SHong Zhang       v        += 25; jmin++;
4397fbae186SHong Zhang     }
440444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
441444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
442831a3094SHong Zhang     for (j=jmin; j<n; j++) {
44349b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
44449b5e25fSSatish Balay       cval       = ib[j]*5;
44549b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
44649b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
44749b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
44849b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
44949b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
45049b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
45149b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
45249b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
45349b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
45449b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
45549b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
45649b5e25fSSatish Balay       v        += 25;
45749b5e25fSSatish Balay     }
45849b5e25fSSatish Balay     xb +=5; ai++;
45949b5e25fSSatish Balay   }
46049b5e25fSSatish Balay 
461d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4621ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
463dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
46449b5e25fSSatish Balay   PetscFunctionReturn(0);
46549b5e25fSSatish Balay }
46649b5e25fSSatish Balay 
46749b5e25fSSatish Balay 
4684a2ae208SSatish Balay #undef __FUNCT__
4694a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_6"
470dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
47149b5e25fSSatish Balay {
47249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
473d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
474d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
475d9ca1df4SBarry Smith   const MatScalar   *v;
4766849ba73SBarry Smith   PetscErrorCode    ierr;
477d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
478d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
47998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
48049b5e25fSSatish Balay 
48149b5e25fSSatish Balay   PetscFunctionBegin;
4822dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
483d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4841ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
48549b5e25fSSatish Balay 
48649b5e25fSSatish Balay   v  = a->a;
48749b5e25fSSatish Balay   xb = x;
48849b5e25fSSatish Balay 
48949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
49049b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
49149b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
49249b5e25fSSatish Balay     ib          = aj + *ai;
493831a3094SHong Zhang     jmin        = 0;
49498c9bda7SSatish Balay     nonzerorow += (n>0);
4957fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
49649b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
49749b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
49849b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
49949b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
50049b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
50149b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
502831a3094SHong Zhang       v        += 36; jmin++;
5037fbae186SHong Zhang     }
504444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
505444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
506831a3094SHong Zhang     for (j=jmin; j<n; j++) {
50749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
50849b5e25fSSatish Balay       cval       = ib[j]*6;
50949b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
51049b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
51149b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
51249b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
51349b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
51449b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
51549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
51649b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
51749b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
51849b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
51949b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
52049b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
52149b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
52249b5e25fSSatish Balay       v        += 36;
52349b5e25fSSatish Balay     }
52449b5e25fSSatish Balay     xb +=6; ai++;
52549b5e25fSSatish Balay   }
52649b5e25fSSatish Balay 
527d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5281ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
529dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
53049b5e25fSSatish Balay   PetscFunctionReturn(0);
53149b5e25fSSatish Balay }
5324a2ae208SSatish Balay #undef __FUNCT__
5334a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_7"
534dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
53549b5e25fSSatish Balay {
53649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
537d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
538d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
539d9ca1df4SBarry Smith   const MatScalar   *v;
5406849ba73SBarry Smith   PetscErrorCode    ierr;
541d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
542d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
54398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
54449b5e25fSSatish Balay 
54549b5e25fSSatish Balay   PetscFunctionBegin;
5462dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
547d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5481ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
54949b5e25fSSatish Balay 
55049b5e25fSSatish Balay   v  = a->a;
55149b5e25fSSatish Balay   xb = x;
55249b5e25fSSatish Balay 
55349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
55449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
55549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
55649b5e25fSSatish Balay     ib          = aj + *ai;
557831a3094SHong Zhang     jmin        = 0;
55898c9bda7SSatish Balay     nonzerorow += (n>0);
5597fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
56049b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
56149b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
56249b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
56349b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
56449b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
56549b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
56649b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
567831a3094SHong Zhang       v        += 49; jmin++;
5687fbae186SHong Zhang     }
569444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
570444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
571831a3094SHong Zhang     for (j=jmin; j<n; j++) {
57249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
57349b5e25fSSatish Balay       cval       = ib[j]*7;
57449b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
57549b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
57649b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
57749b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
57849b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
57949b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
58049b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
58149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
58249b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
58349b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
58449b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
58549b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
58649b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
58749b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
58849b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
58949b5e25fSSatish Balay       v       += 49;
59049b5e25fSSatish Balay     }
59149b5e25fSSatish Balay     xb +=7; ai++;
59249b5e25fSSatish Balay   }
593d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5941ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
595dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
59649b5e25fSSatish Balay   PetscFunctionReturn(0);
59749b5e25fSSatish Balay }
59849b5e25fSSatish Balay 
59949b5e25fSSatish Balay /*
60049b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
60149b5e25fSSatish Balay */
6024a2ae208SSatish Balay #undef __FUNCT__
6034a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_N"
604dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
60549b5e25fSSatish Balay {
60649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
607d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
608d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
609d9ca1df4SBarry Smith   const MatScalar   *v;
610dfbe8321SBarry Smith   PetscErrorCode    ierr;
611d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
612d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
61398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
61449b5e25fSSatish Balay 
61549b5e25fSSatish Balay   PetscFunctionBegin;
6162dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
617d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
6181ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
61949b5e25fSSatish Balay 
62049b5e25fSSatish Balay   aj = a->j;
62149b5e25fSSatish Balay   v  = a->a;
62249b5e25fSSatish Balay   ii = a->i;
62349b5e25fSSatish Balay 
62449b5e25fSSatish Balay   if (!a->mult_work) {
625854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
62649b5e25fSSatish Balay   }
62749b5e25fSSatish Balay   work = a->mult_work;
62849b5e25fSSatish Balay 
62949b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
63049b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
63149b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
63298c9bda7SSatish Balay     nonzerorow += (n>0);
63349b5e25fSSatish Balay 
63449b5e25fSSatish Balay     /* upper triangular part */
63549b5e25fSSatish Balay     for (j=0; j<n; j++) {
63649b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
63749b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
63849b5e25fSSatish Balay       workt += bs;
63949b5e25fSSatish Balay     }
64049b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
64196b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
64249b5e25fSSatish Balay 
64349b5e25fSSatish Balay     /* strict lower triangular part */
644831a3094SHong Zhang     idx = aj+ii[0];
645831a3094SHong Zhang     if (*idx == i) {
64696b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
647831a3094SHong Zhang     }
64896b9376eSHong Zhang 
64949b5e25fSSatish Balay     if (ncols > 0) {
65049b5e25fSSatish Balay       workt = work;
65187828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
65296b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
653831a3094SHong Zhang       for (j=0; j<n; j++) {
654831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
65549b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
65649b5e25fSSatish Balay         workt += bs;
65749b5e25fSSatish Balay       }
65849b5e25fSSatish Balay     }
65949b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
66049b5e25fSSatish Balay   }
66149b5e25fSSatish Balay 
662d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6631ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
664dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
66549b5e25fSSatish Balay   PetscFunctionReturn(0);
66649b5e25fSSatish Balay }
66749b5e25fSSatish Balay 
6684a2ae208SSatish Balay #undef __FUNCT__
6694a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
670dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
67149b5e25fSSatish Balay {
67249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
673d9ca1df4SBarry Smith   PetscScalar       *z,x1;
674d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
675d9ca1df4SBarry Smith   const MatScalar   *v;
6766849ba73SBarry Smith   PetscErrorCode    ierr;
677d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
678d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
67998c9bda7SSatish Balay   PetscInt          nonzerorow=0;
68049b5e25fSSatish Balay 
68149b5e25fSSatish Balay   PetscFunctionBegin;
682343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
683d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
6841ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
68549b5e25fSSatish Balay   v    = a->a;
68649b5e25fSSatish Balay   xb   = x;
68749b5e25fSSatish Balay 
68849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
68949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th row of A */
69049b5e25fSSatish Balay     x1          = xb[0];
69149b5e25fSSatish Balay     ib          = aj + *ai;
692831a3094SHong Zhang     jmin        = 0;
69398c9bda7SSatish Balay     nonzerorow += (n>0);
694831a3094SHong Zhang     if (*ib == i) {            /* (diag of A)*x */
695831a3094SHong Zhang       z[i] += *v++ * x[*ib++]; jmin++;
696831a3094SHong Zhang     }
697831a3094SHong Zhang     for (j=jmin; j<n; j++) {
69849b5e25fSSatish Balay       cval    = *ib;
69949b5e25fSSatish Balay       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
70049b5e25fSSatish Balay       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
70149b5e25fSSatish Balay     }
70249b5e25fSSatish Balay     xb++; ai++;
70349b5e25fSSatish Balay   }
70449b5e25fSSatish Balay 
705d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
706bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
70749b5e25fSSatish Balay 
708dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
70949b5e25fSSatish Balay   PetscFunctionReturn(0);
71049b5e25fSSatish Balay }
71149b5e25fSSatish Balay 
7124a2ae208SSatish Balay #undef __FUNCT__
7134a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
714dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
71549b5e25fSSatish Balay {
71649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
717d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2;
718d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
719d9ca1df4SBarry Smith   const MatScalar   *v;
7206849ba73SBarry Smith   PetscErrorCode    ierr;
721d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
722d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
72398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
72449b5e25fSSatish Balay 
72549b5e25fSSatish Balay   PetscFunctionBegin;
726343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
727d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7281ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
72949b5e25fSSatish Balay 
73049b5e25fSSatish Balay   v  = a->a;
73149b5e25fSSatish Balay   xb = x;
73249b5e25fSSatish Balay 
73349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
73449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
73549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
73649b5e25fSSatish Balay     ib          = aj + *ai;
737831a3094SHong Zhang     jmin        = 0;
73898c9bda7SSatish Balay     nonzerorow += (n>0);
7397fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
74049b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
74149b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
742831a3094SHong Zhang       v        += 4; jmin++;
7437fbae186SHong Zhang     }
744444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
745444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
746831a3094SHong Zhang     for (j=jmin; j<n; j++) {
74749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
74849b5e25fSSatish Balay       cval       = ib[j]*2;
74949b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
75049b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
75149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
75249b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
75349b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
75449b5e25fSSatish Balay       v        += 4;
75549b5e25fSSatish Balay     }
75649b5e25fSSatish Balay     xb +=2; ai++;
75749b5e25fSSatish Balay   }
758d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
759bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
76049b5e25fSSatish Balay 
761dc0b31edSSatish Balay   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
76249b5e25fSSatish Balay   PetscFunctionReturn(0);
76349b5e25fSSatish Balay }
76449b5e25fSSatish Balay 
7654a2ae208SSatish Balay #undef __FUNCT__
7664a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
767dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
76849b5e25fSSatish Balay {
76949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
770d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3;
771d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
772d9ca1df4SBarry Smith   const MatScalar   *v;
7736849ba73SBarry Smith   PetscErrorCode    ierr;
774d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
775d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
77698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
77749b5e25fSSatish Balay 
77849b5e25fSSatish Balay   PetscFunctionBegin;
779343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
780d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7811ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
78249b5e25fSSatish Balay 
78349b5e25fSSatish Balay   v  = a->a;
78449b5e25fSSatish Balay   xb = x;
78549b5e25fSSatish Balay 
78649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
78749b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
78849b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
78949b5e25fSSatish Balay     ib          = aj + *ai;
790831a3094SHong Zhang     jmin        = 0;
79198c9bda7SSatish Balay     nonzerorow += (n>0);
7927fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
79349b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
79449b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
79549b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
796831a3094SHong Zhang       v        += 9; jmin++;
7977fbae186SHong Zhang     }
798444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
799444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
800831a3094SHong Zhang     for (j=jmin; j<n; j++) {
80149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
80249b5e25fSSatish Balay       cval       = ib[j]*3;
80349b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
80449b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
80549b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
80649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
80749b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
80849b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
80949b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
81049b5e25fSSatish Balay       v        += 9;
81149b5e25fSSatish Balay     }
81249b5e25fSSatish Balay     xb +=3; ai++;
81349b5e25fSSatish Balay   }
81449b5e25fSSatish Balay 
815d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
816bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
81749b5e25fSSatish Balay 
818dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
81949b5e25fSSatish Balay   PetscFunctionReturn(0);
82049b5e25fSSatish Balay }
82149b5e25fSSatish Balay 
8224a2ae208SSatish Balay #undef __FUNCT__
8234a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
824dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
82549b5e25fSSatish Balay {
82649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
827d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4;
828d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
829d9ca1df4SBarry Smith   const MatScalar   *v;
8306849ba73SBarry Smith   PetscErrorCode    ierr;
831d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
832d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
83398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
83449b5e25fSSatish Balay 
83549b5e25fSSatish Balay   PetscFunctionBegin;
836343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
837d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8381ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
83949b5e25fSSatish Balay 
84049b5e25fSSatish Balay   v  = a->a;
84149b5e25fSSatish Balay   xb = x;
84249b5e25fSSatish Balay 
84349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
84449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
84549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
84649b5e25fSSatish Balay     ib          = aj + *ai;
847831a3094SHong Zhang     jmin        = 0;
84898c9bda7SSatish Balay     nonzerorow += (n>0);
8497fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
85049b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
85149b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
85249b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
85349b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
854831a3094SHong Zhang       v        += 16; jmin++;
8557fbae186SHong Zhang     }
856444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
857444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
858831a3094SHong Zhang     for (j=jmin; j<n; j++) {
85949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
86049b5e25fSSatish Balay       cval       = ib[j]*4;
86149b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
86249b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
86349b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
86449b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
86549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
86649b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
86749b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
86849b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
86949b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
87049b5e25fSSatish Balay       v        += 16;
87149b5e25fSSatish Balay     }
87249b5e25fSSatish Balay     xb +=4; ai++;
87349b5e25fSSatish Balay   }
87449b5e25fSSatish Balay 
875d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
876bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
87749b5e25fSSatish Balay 
878dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
87949b5e25fSSatish Balay   PetscFunctionReturn(0);
88049b5e25fSSatish Balay }
88149b5e25fSSatish Balay 
8824a2ae208SSatish Balay #undef __FUNCT__
8834a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
884dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
88549b5e25fSSatish Balay {
88649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
887d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5;
888d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
889d9ca1df4SBarry Smith   const MatScalar   *v;
8906849ba73SBarry Smith   PetscErrorCode    ierr;
891d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
892d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
89398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
89449b5e25fSSatish Balay 
89549b5e25fSSatish Balay   PetscFunctionBegin;
896343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
897d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8981ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
89949b5e25fSSatish Balay 
90049b5e25fSSatish Balay   v  = a->a;
90149b5e25fSSatish Balay   xb = x;
90249b5e25fSSatish Balay 
90349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
90449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
90549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
90649b5e25fSSatish Balay     ib          = aj + *ai;
907831a3094SHong Zhang     jmin        = 0;
90898c9bda7SSatish Balay     nonzerorow += (n>0);
9097fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
91049b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
91149b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
91249b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
91349b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
91449b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
915831a3094SHong Zhang       v        += 25; jmin++;
9167fbae186SHong Zhang     }
917444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
918444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
919831a3094SHong Zhang     for (j=jmin; j<n; j++) {
92049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
92149b5e25fSSatish Balay       cval       = ib[j]*5;
92249b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
92349b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
92449b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
92549b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
92649b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
92749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
92849b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
92949b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
93049b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
93149b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
93249b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
93349b5e25fSSatish Balay       v        += 25;
93449b5e25fSSatish Balay     }
93549b5e25fSSatish Balay     xb +=5; ai++;
93649b5e25fSSatish Balay   }
93749b5e25fSSatish Balay 
938d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
939bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
94049b5e25fSSatish Balay 
941dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
94249b5e25fSSatish Balay   PetscFunctionReturn(0);
94349b5e25fSSatish Balay }
9444a2ae208SSatish Balay #undef __FUNCT__
9454a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
946dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
94749b5e25fSSatish Balay {
94849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
949d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
950d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
951d9ca1df4SBarry Smith   const MatScalar   *v;
9526849ba73SBarry Smith   PetscErrorCode    ierr;
953d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
954d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
95598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
95649b5e25fSSatish Balay 
95749b5e25fSSatish Balay   PetscFunctionBegin;
958343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
959d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9601ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
96149b5e25fSSatish Balay 
96249b5e25fSSatish Balay   v  = a->a;
96349b5e25fSSatish Balay   xb = x;
96449b5e25fSSatish Balay 
96549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
96649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
96749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
96849b5e25fSSatish Balay     ib          = aj + *ai;
969831a3094SHong Zhang     jmin        = 0;
97098c9bda7SSatish Balay     nonzerorow += (n>0);
9717fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
97249b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
97349b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
97449b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
97549b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
97649b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
97749b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
978831a3094SHong Zhang       v        += 36; jmin++;
9797fbae186SHong Zhang     }
980444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
981444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
982831a3094SHong Zhang     for (j=jmin; j<n; j++) {
98349b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
98449b5e25fSSatish Balay       cval       = ib[j]*6;
98549b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
98649b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
98749b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
98849b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
98949b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
99049b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
99149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
99249b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
99349b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
99449b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
99549b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
99649b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
99749b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
99849b5e25fSSatish Balay       v        += 36;
99949b5e25fSSatish Balay     }
100049b5e25fSSatish Balay     xb +=6; ai++;
100149b5e25fSSatish Balay   }
100249b5e25fSSatish Balay 
1003d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1004bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
100549b5e25fSSatish Balay 
1006dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
100749b5e25fSSatish Balay   PetscFunctionReturn(0);
100849b5e25fSSatish Balay }
100949b5e25fSSatish Balay 
10104a2ae208SSatish Balay #undef __FUNCT__
10114a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1012dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
101349b5e25fSSatish Balay {
101449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1015d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1016d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1017d9ca1df4SBarry Smith   const MatScalar   *v;
10186849ba73SBarry Smith   PetscErrorCode    ierr;
1019d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1020d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
102198c9bda7SSatish Balay   PetscInt          nonzerorow=0;
102249b5e25fSSatish Balay 
102349b5e25fSSatish Balay   PetscFunctionBegin;
1024343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1025d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10261ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
102749b5e25fSSatish Balay 
102849b5e25fSSatish Balay   v  = a->a;
102949b5e25fSSatish Balay   xb = x;
103049b5e25fSSatish Balay 
103149b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
103249b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
103349b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
103449b5e25fSSatish Balay     ib          = aj + *ai;
1035831a3094SHong Zhang     jmin        = 0;
103698c9bda7SSatish Balay     nonzerorow += (n>0);
10377fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
103849b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
103949b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
104049b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
104149b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
104249b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
104349b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
104449b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1045831a3094SHong Zhang       v        += 49; jmin++;
10467fbae186SHong Zhang     }
1047444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1048444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1049831a3094SHong Zhang     for (j=jmin; j<n; j++) {
105049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
105149b5e25fSSatish Balay       cval       = ib[j]*7;
105249b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
105349b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
105449b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
105549b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
105649b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
105749b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
105849b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
105949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
106049b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
106149b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
106249b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
106349b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
106449b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
106549b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
106649b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
106749b5e25fSSatish Balay       v       += 49;
106849b5e25fSSatish Balay     }
106949b5e25fSSatish Balay     xb +=7; ai++;
107049b5e25fSSatish Balay   }
107149b5e25fSSatish Balay 
1072d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1073bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
107449b5e25fSSatish Balay 
1075dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
107649b5e25fSSatish Balay   PetscFunctionReturn(0);
107749b5e25fSSatish Balay }
107849b5e25fSSatish Balay 
10794a2ae208SSatish Balay #undef __FUNCT__
10804a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1081dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
108249b5e25fSSatish Balay {
108349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1084d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1085d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
1086d9ca1df4SBarry Smith   const MatScalar   *v;
1087dfbe8321SBarry Smith   PetscErrorCode    ierr;
1088d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1089d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
109098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
109149b5e25fSSatish Balay 
109249b5e25fSSatish Balay   PetscFunctionBegin;
1093343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1094d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
10951ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
109649b5e25fSSatish Balay 
109749b5e25fSSatish Balay   aj = a->j;
109849b5e25fSSatish Balay   v  = a->a;
109949b5e25fSSatish Balay   ii = a->i;
110049b5e25fSSatish Balay 
110149b5e25fSSatish Balay   if (!a->mult_work) {
1102854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
110349b5e25fSSatish Balay   }
110449b5e25fSSatish Balay   work = a->mult_work;
110549b5e25fSSatish Balay 
110649b5e25fSSatish Balay 
110749b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
110849b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
110949b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
111098c9bda7SSatish Balay     nonzerorow += (n>0);
111149b5e25fSSatish Balay 
111249b5e25fSSatish Balay     /* upper triangular part */
111349b5e25fSSatish Balay     for (j=0; j<n; j++) {
111449b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
111549b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
111649b5e25fSSatish Balay       workt += bs;
111749b5e25fSSatish Balay     }
111849b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
111996b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
112049b5e25fSSatish Balay 
112149b5e25fSSatish Balay     /* strict lower triangular part */
1122831a3094SHong Zhang     idx = aj+ii[0];
1123831a3094SHong Zhang     if (*idx == i) {
112496b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
1125831a3094SHong Zhang     }
112649b5e25fSSatish Balay     if (ncols > 0) {
112749b5e25fSSatish Balay       workt = work;
112887828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
112996b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1130831a3094SHong Zhang       for (j=0; j<n; j++) {
1131831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
113249b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
113349b5e25fSSatish Balay         workt += bs;
113449b5e25fSSatish Balay       }
113549b5e25fSSatish Balay     }
113649b5e25fSSatish Balay 
113749b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
113849b5e25fSSatish Balay   }
113949b5e25fSSatish Balay 
1140d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1141bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
114249b5e25fSSatish Balay 
1143dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
114449b5e25fSSatish Balay   PetscFunctionReturn(0);
114549b5e25fSSatish Balay }
114649b5e25fSSatish Balay 
11474a2ae208SSatish Balay #undef __FUNCT__
11484a2ae208SSatish Balay #define __FUNCT__ "MatScale_SeqSBAIJ"
1149f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
115049b5e25fSSatish Balay {
115149b5e25fSSatish Balay   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1152f4df32b1SMatthew Knepley   PetscScalar    oalpha = alpha;
1153efee365bSSatish Balay   PetscErrorCode ierr;
1154c5df96a5SBarry Smith   PetscBLASInt   one = 1,totalnz;
115549b5e25fSSatish Balay 
115649b5e25fSSatish Balay   PetscFunctionBegin;
1157c5df96a5SBarry Smith   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
11588b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1159efee365bSSatish Balay   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
116049b5e25fSSatish Balay   PetscFunctionReturn(0);
116149b5e25fSSatish Balay }
116249b5e25fSSatish Balay 
11634a2ae208SSatish Balay #undef __FUNCT__
11644a2ae208SSatish Balay #define __FUNCT__ "MatNorm_SeqSBAIJ"
1165dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
116649b5e25fSSatish Balay {
116749b5e25fSSatish Balay   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1168d9ca1df4SBarry Smith   const MatScalar *v       = a->a;
116949b5e25fSSatish Balay   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1170d9ca1df4SBarry Smith   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
11716849ba73SBarry Smith   PetscErrorCode  ierr;
1172d9ca1df4SBarry Smith   const PetscInt  *aj=a->j,*col;
117349b5e25fSSatish Balay 
117449b5e25fSSatish Balay   PetscFunctionBegin;
117549b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
117649b5e25fSSatish Balay     for (k=0; k<mbs; k++) {
117749b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
1178831a3094SHong Zhang       col  = aj + jmin;
1179831a3094SHong Zhang       if (*col == k) {         /* diagonal block */
118049b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
118149b5e25fSSatish Balay           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
118249b5e25fSSatish Balay         }
1183831a3094SHong Zhang         jmin++;
1184831a3094SHong Zhang       }
1185831a3094SHong Zhang       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
118649b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
118749b5e25fSSatish Balay           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
118849b5e25fSSatish Balay         }
118949b5e25fSSatish Balay       }
119049b5e25fSSatish Balay     }
11918f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
11920b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1193dcca6d9dSJed Brown     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
11940b8dc8d2SHong Zhang     for (i=0; i<mbs; i++) jl[i] = mbs;
11950b8dc8d2SHong Zhang     il[0] = 0;
119649b5e25fSSatish Balay 
119749b5e25fSSatish Balay     *norm = 0.0;
119849b5e25fSSatish Balay     for (k=0; k<mbs; k++) { /* k_th block row */
119949b5e25fSSatish Balay       for (j=0; j<bs; j++) sum[j]=0.0;
120049b5e25fSSatish Balay       /*-- col sum --*/
120149b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1202831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1203831a3094SHong Zhang                   at step k */
120449b5e25fSSatish Balay       while (i<mbs) {
120549b5e25fSSatish Balay         nexti = jl[i];  /* next block row to be added */
120649b5e25fSSatish Balay         ik    = il[i];  /* block index of A(i,k) in the array a */
120749b5e25fSSatish Balay         for (j=0; j<bs; j++) {
120849b5e25fSSatish Balay           v = a->a + ik*bs2 + j*bs;
120949b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
121049b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v); v++;
121149b5e25fSSatish Balay           }
121249b5e25fSSatish Balay         }
121349b5e25fSSatish Balay         /* update il, jl */
1214831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1215831a3094SHong Zhang         jmax = a->i[i+1];
121649b5e25fSSatish Balay         if (jmin < jmax) {
121749b5e25fSSatish Balay           il[i] = jmin;
121849b5e25fSSatish Balay           j     = a->j[jmin];
121949b5e25fSSatish Balay           jl[i] = jl[j]; jl[j]=i;
122049b5e25fSSatish Balay         }
122149b5e25fSSatish Balay         i = nexti;
122249b5e25fSSatish Balay       }
122349b5e25fSSatish Balay       /*-- row sum --*/
122449b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
122549b5e25fSSatish Balay       for (i=jmin; i<jmax; i++) {
122649b5e25fSSatish Balay         for (j=0; j<bs; j++) {
122749b5e25fSSatish Balay           v = a->a + i*bs2 + j;
122849b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
12290b8dc8d2SHong Zhang             sum[j] += PetscAbsScalar(*v); v += bs;
123049b5e25fSSatish Balay           }
123149b5e25fSSatish Balay         }
123249b5e25fSSatish Balay       }
123349b5e25fSSatish Balay       /* add k_th block row to il, jl */
1234831a3094SHong Zhang       col = aj+jmin;
1235831a3094SHong Zhang       if (*col == k) jmin++;
123649b5e25fSSatish Balay       if (jmin < jmax) {
123749b5e25fSSatish Balay         il[k] = jmin;
12380b8dc8d2SHong Zhang         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
123949b5e25fSSatish Balay       }
124049b5e25fSSatish Balay       for (j=0; j<bs; j++) {
124149b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
124249b5e25fSSatish Balay       }
124349b5e25fSSatish Balay     }
124474ed9c26SBarry Smith     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1245f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
124649b5e25fSSatish Balay   PetscFunctionReturn(0);
124749b5e25fSSatish Balay }
124849b5e25fSSatish Balay 
12494a2ae208SSatish Balay #undef __FUNCT__
12504a2ae208SSatish Balay #define __FUNCT__ "MatEqual_SeqSBAIJ"
1251ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
125249b5e25fSSatish Balay {
125349b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1254dfbe8321SBarry Smith   PetscErrorCode ierr;
125549b5e25fSSatish Balay 
125649b5e25fSSatish Balay   PetscFunctionBegin;
125749b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1258d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1259ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1260ef511fbeSHong Zhang     PetscFunctionReturn(0);
126149b5e25fSSatish Balay   }
126249b5e25fSSatish Balay 
126349b5e25fSSatish Balay   /* if the a->i are the same */
126413f74950SBarry Smith   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
126526fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
126649b5e25fSSatish Balay 
126749b5e25fSSatish Balay   /* if a->j are the same */
126813f74950SBarry Smith   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
126926fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
127026fbe8dcSKarl Rupp 
127149b5e25fSSatish Balay   /* if a->a are the same */
1272d0f46423SBarry Smith   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1273935af2e7SHong Zhang   PetscFunctionReturn(0);
127449b5e25fSSatish Balay }
127549b5e25fSSatish Balay 
12764a2ae208SSatish Balay #undef __FUNCT__
12774a2ae208SSatish Balay #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1278dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
127949b5e25fSSatish Balay {
128049b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1281dfbe8321SBarry Smith   PetscErrorCode  ierr;
1282d9ca1df4SBarry Smith   PetscInt        i,j,k,row,bs,ambs,bs2;
1283d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
128487828ca2SBarry Smith   PetscScalar     *x,zero = 0.0;
1285d9ca1df4SBarry Smith   const MatScalar *aa,*aa_j;
128649b5e25fSSatish Balay 
128749b5e25fSSatish Balay   PetscFunctionBegin;
1288d0f46423SBarry Smith   bs = A->rmap->bs;
1289e32f2f54SBarry Smith   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
129082799104SHong Zhang 
129149b5e25fSSatish Balay   aa   = a->a;
12928a0c6efdSHong Zhang   ambs = a->mbs;
12938a0c6efdSHong Zhang 
12948a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
12958a0c6efdSHong Zhang     PetscInt *diag=a->diag;
12968a0c6efdSHong Zhang     aa   = a->a;
12978a0c6efdSHong Zhang     ambs = a->mbs;
12988a0c6efdSHong Zhang     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
12998a0c6efdSHong Zhang     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
13008a0c6efdSHong Zhang     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
13018a0c6efdSHong Zhang     PetscFunctionReturn(0);
13028a0c6efdSHong Zhang   }
13038a0c6efdSHong Zhang 
130449b5e25fSSatish Balay   ai   = a->i;
130549b5e25fSSatish Balay   aj   = a->j;
130649b5e25fSSatish Balay   bs2  = a->bs2;
13072dcb1b2aSMatthew Knepley   ierr = VecSet(v,zero);CHKERRQ(ierr);
13081ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
130949b5e25fSSatish Balay   for (i=0; i<ambs; i++) {
131049b5e25fSSatish Balay     j=ai[i];
131149b5e25fSSatish Balay     if (aj[j] == i) {    /* if this is a diagonal element */
131249b5e25fSSatish Balay       row  = i*bs;
131349b5e25fSSatish Balay       aa_j = aa + j*bs2;
131449b5e25fSSatish Balay       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
131549b5e25fSSatish Balay     }
131649b5e25fSSatish Balay   }
13171ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
131849b5e25fSSatish Balay   PetscFunctionReturn(0);
131949b5e25fSSatish Balay }
132049b5e25fSSatish Balay 
13214a2ae208SSatish Balay #undef __FUNCT__
13224a2ae208SSatish Balay #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1323dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
132449b5e25fSSatish Balay {
132549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1326d9ca1df4SBarry Smith   PetscScalar       x;
1327d9ca1df4SBarry Smith   const PetscScalar *l,*li,*ri;
132849b5e25fSSatish Balay   MatScalar         *aa,*v;
1329dfbe8321SBarry Smith   PetscErrorCode    ierr;
13305e90f9d9SHong Zhang   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1331ace3abfcSBarry Smith   PetscBool         flg;
133249b5e25fSSatish Balay 
133349b5e25fSSatish Balay   PetscFunctionBegin;
1334b3bf805bSHong Zhang   if (ll != rr) {
1335b3bf805bSHong Zhang     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1336e7e72b3dSBarry Smith     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1337b3bf805bSHong Zhang   }
1338b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
133949b5e25fSSatish Balay   ai  = a->i;
134049b5e25fSSatish Balay   aj  = a->j;
134149b5e25fSSatish Balay   aa  = a->a;
1342d0f46423SBarry Smith   m   = A->rmap->N;
1343d0f46423SBarry Smith   bs  = A->rmap->bs;
134449b5e25fSSatish Balay   mbs = a->mbs;
134549b5e25fSSatish Balay   bs2 = a->bs2;
134649b5e25fSSatish Balay 
1347d9ca1df4SBarry Smith   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
134849b5e25fSSatish Balay   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1349e32f2f54SBarry Smith   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
135049b5e25fSSatish Balay   for (i=0; i<mbs; i++) { /* for each block row */
135149b5e25fSSatish Balay     M  = ai[i+1] - ai[i];
135249b5e25fSSatish Balay     li = l + i*bs;
135349b5e25fSSatish Balay     v  = aa + bs2*ai[i];
135449b5e25fSSatish Balay     for (j=0; j<M; j++) { /* for each block */
135549b5e25fSSatish Balay       ri = l + bs*aj[ai[i]+j];
13565e90f9d9SHong Zhang       for (k=0; k<bs; k++) {
135749b5e25fSSatish Balay         x = ri[k];
135849b5e25fSSatish Balay         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
135949b5e25fSSatish Balay       }
136049b5e25fSSatish Balay     }
136149b5e25fSSatish Balay   }
1362d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1363dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
136449b5e25fSSatish Balay   PetscFunctionReturn(0);
136549b5e25fSSatish Balay }
136649b5e25fSSatish Balay 
13674a2ae208SSatish Balay #undef __FUNCT__
13684a2ae208SSatish Balay #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1369dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
137049b5e25fSSatish Balay {
137149b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
137249b5e25fSSatish Balay 
137349b5e25fSSatish Balay   PetscFunctionBegin;
137449b5e25fSSatish Balay   info->block_size   = a->bs2;
1375ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
13766c6c5352SBarry Smith   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
137749b5e25fSSatish Balay   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
137849b5e25fSSatish Balay   info->assemblies   = A->num_ass;
13798e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
13807adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
1381d5f3da31SBarry Smith   if (A->factortype) {
138249b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
138349b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
138449b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
138549b5e25fSSatish Balay   } else {
138649b5e25fSSatish Balay     info->fill_ratio_given  = 0;
138749b5e25fSSatish Balay     info->fill_ratio_needed = 0;
138849b5e25fSSatish Balay     info->factor_mallocs    = 0;
138949b5e25fSSatish Balay   }
139049b5e25fSSatish Balay   PetscFunctionReturn(0);
139149b5e25fSSatish Balay }
139249b5e25fSSatish Balay 
139349b5e25fSSatish Balay 
13944a2ae208SSatish Balay #undef __FUNCT__
13954a2ae208SSatish Balay #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1396dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
139749b5e25fSSatish Balay {
139849b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1399dfbe8321SBarry Smith   PetscErrorCode ierr;
140049b5e25fSSatish Balay 
140149b5e25fSSatish Balay   PetscFunctionBegin;
140249b5e25fSSatish Balay   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
140349b5e25fSSatish Balay   PetscFunctionReturn(0);
140449b5e25fSSatish Balay }
1405dc354874SHong Zhang 
14064a2ae208SSatish Balay #undef __FUNCT__
1407985db425SBarry Smith #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1408985db425SBarry Smith /*
1409985db425SBarry Smith    This code does not work since it only checks the upper triangular part of
1410985db425SBarry Smith   the matrix. Hence it is not listed in the function table.
1411985db425SBarry Smith */
1412985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1413dc354874SHong Zhang {
1414dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1415dfbe8321SBarry Smith   PetscErrorCode  ierr;
1416d9ca1df4SBarry Smith   PetscInt        i,j,n,row,col,bs,mbs;
1417d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
1418c3fca9a7SHong Zhang   PetscReal       atmp;
1419d9ca1df4SBarry Smith   const MatScalar *aa;
1420985db425SBarry Smith   PetscScalar     *x;
142113f74950SBarry Smith   PetscInt        ncols,brow,bcol,krow,kcol;
1422dc354874SHong Zhang 
1423dc354874SHong Zhang   PetscFunctionBegin;
1424e32f2f54SBarry Smith   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1425e32f2f54SBarry Smith   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1426d0f46423SBarry Smith   bs  = A->rmap->bs;
1427dc354874SHong Zhang   aa  = a->a;
1428dc354874SHong Zhang   ai  = a->i;
1429dc354874SHong Zhang   aj  = a->j;
143044117c81SHong Zhang   mbs = a->mbs;
1431dc354874SHong Zhang 
1432985db425SBarry Smith   ierr = VecSet(v,0.0);CHKERRQ(ierr);
14331ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1434dc354874SHong Zhang   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1435e32f2f54SBarry Smith   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
143644117c81SHong Zhang   for (i=0; i<mbs; i++) {
1437d0f6400bSHong Zhang     ncols = ai[1] - ai[0]; ai++;
1438d0f6400bSHong Zhang     brow  = bs*i;
143944117c81SHong Zhang     for (j=0; j<ncols; j++) {
1440d0f6400bSHong Zhang       bcol = bs*(*aj);
144144117c81SHong Zhang       for (kcol=0; kcol<bs; kcol++) {
1442d0f6400bSHong Zhang         col = bcol + kcol;      /* col index */
144344117c81SHong Zhang         for (krow=0; krow<bs; krow++) {
1444d0f6400bSHong Zhang           atmp = PetscAbsScalar(*aa); aa++;
1445d0f6400bSHong Zhang           row  = brow + krow;   /* row index */
1446c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1447c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
144844117c81SHong Zhang         }
144944117c81SHong Zhang       }
1450d0f6400bSHong Zhang       aj++;
1451dc354874SHong Zhang     }
1452dc354874SHong Zhang   }
14531ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1454dc354874SHong Zhang   PetscFunctionReturn(0);
1455dc354874SHong Zhang }
1456