xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision d9ca1df42eda25bc875149d6d493aad6e204c9ff)
149b5e25fSSatish Balay 
2c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
306873bf2SBarry Smith #include <petsc-private/kernels/blockinvert.h>
4c6db04a5SJed Brown #include <petscbt.h>
5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h>
6c6db04a5SJed Brown #include <petscblaslapack.h>
749b5e25fSSatish Balay 
84a2ae208SSatish Balay #undef __FUNCT__
94a2ae208SSatish Balay #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
1013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
1149b5e25fSSatish Balay {
125eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
136849ba73SBarry Smith   PetscErrorCode ierr;
145d0c19d7SBarry Smith   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
155d0c19d7SBarry Smith   const PetscInt *idx;
16db41cccfSHong Zhang   PetscBT        table_out,table_in;
17d94109b8SHong Zhang 
18d94109b8SHong Zhang   PetscFunctionBegin;
19e32f2f54SBarry Smith   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20d94109b8SHong Zhang   mbs  = a->mbs;
21d94109b8SHong Zhang   ai   = a->i;
22d94109b8SHong Zhang   aj   = a->j;
23d0f46423SBarry Smith   bs   = A->rmap->bs;
2453b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25854ce69bSBarry Smith   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
26854ce69bSBarry Smith   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
2753b8de81SBarry Smith   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28d94109b8SHong Zhang 
29d94109b8SHong Zhang   for (i=0; i<is_max; i++) { /* for each is */
30d94109b8SHong Zhang     isz  = 0;
31db41cccfSHong Zhang     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32d94109b8SHong Zhang 
33d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
34d94109b8SHong Zhang     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35d94109b8SHong Zhang     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36d94109b8SHong Zhang 
37db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38dbe03f88SHong Zhang     bcol_max = 0;
39d94109b8SHong Zhang     for (j=0; j<n; ++j) {
40d94109b8SHong Zhang       brow = idx[j]/bs; /* convert the indices into block indices */
41e32f2f54SBarry Smith       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out,brow)) {
43dbe03f88SHong Zhang         nidx[isz++] = brow;
44dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
45dbe03f88SHong Zhang       }
46d94109b8SHong Zhang     }
47d94109b8SHong Zhang     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
486bf464f9SBarry Smith     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49d94109b8SHong Zhang 
50d94109b8SHong Zhang     k = 0;
51d94109b8SHong Zhang     for (j=0; j<ov; j++) { /* for each overlap */
52db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53db41cccfSHong Zhang       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54db41cccfSHong Zhang       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55d94109b8SHong Zhang 
56d94109b8SHong Zhang       n = isz;  /* length of the updated is[i] */
57d94109b8SHong Zhang       for (brow=0; brow<mbs; brow++) {
58d94109b8SHong Zhang         start = ai[brow]; end   = ai[brow+1];
59db41cccfSHong Zhang         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60d94109b8SHong Zhang           for (l = start; l<end; l++) {
61d94109b8SHong Zhang             bcol = aj[l];
62d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out,bcol)) {
63d7b97159SHong Zhang               nidx[isz++] = bcol;
64d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
65d7b97159SHong Zhang             }
66d94109b8SHong Zhang           }
67d94109b8SHong Zhang           k++;
68d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69d94109b8SHong Zhang         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70d94109b8SHong Zhang           for (l = start; l<end; l++) {
71d94109b8SHong Zhang             bcol = aj[l];
72dbe03f88SHong Zhang             if (bcol > bcol_max) break;
73db41cccfSHong Zhang             if (PetscBTLookup(table_in,bcol)) {
7426fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
76d94109b8SHong Zhang             }
77d94109b8SHong Zhang           }
78d94109b8SHong Zhang         }
79d94109b8SHong Zhang       }
80d94109b8SHong Zhang     } /* for each overlap */
81d94109b8SHong Zhang 
82d94109b8SHong Zhang     /* expand the Index Set */
83d94109b8SHong Zhang     for (j=0; j<isz; j++) {
8426fbe8dcSKarl Rupp       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85d94109b8SHong Zhang     }
8670b3c8c7SBarry Smith     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
87d94109b8SHong Zhang   }
8894bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
89d94109b8SHong Zhang   ierr = PetscFree(nidx);CHKERRQ(ierr);
90d94109b8SHong Zhang   ierr = PetscFree(nidx2);CHKERRQ(ierr);
9194bacf5dSBarry Smith   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
925eee224dSHong Zhang   PetscFunctionReturn(0);
9349b5e25fSSatish Balay }
9449b5e25fSSatish Balay 
954a2ae208SSatish Balay #undef __FUNCT__
964a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
974aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
9849b5e25fSSatish Balay {
9949b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data,*c;
1006849ba73SBarry Smith   PetscErrorCode  ierr;
10113f74950SBarry Smith   PetscInt        *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
10213f74950SBarry Smith   PetscInt        row,mat_i,*mat_j,tcol,*mat_ilen;
1035d0c19d7SBarry Smith   PetscInt        nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
1045d0c19d7SBarry Smith   const PetscInt  *irow,*aj = a->j,*ai = a->i;
10549b5e25fSSatish Balay   MatScalar       *mat_a;
10649b5e25fSSatish Balay   Mat             C;
107ace3abfcSBarry Smith   PetscBool       flag,sorted;
10849b5e25fSSatish Balay 
10949b5e25fSSatish Balay   PetscFunctionBegin;
110e32f2f54SBarry Smith   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
11114ca34e6SBarry Smith   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
112e32f2f54SBarry Smith   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
11349b5e25fSSatish Balay 
11449b5e25fSSatish Balay   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
11549b5e25fSSatish Balay   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
11649b5e25fSSatish Balay 
117785e854fSJed Brown   ierr  = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr);
11874ed9c26SBarry Smith   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
11949b5e25fSSatish Balay   ssmap = smap;
120854ce69bSBarry Smith   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
12149b5e25fSSatish Balay   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
12249b5e25fSSatish Balay   /* determine lens of each row */
12349b5e25fSSatish Balay   for (i=0; i<nrows; i++) {
12449b5e25fSSatish Balay     kstart  = ai[irow[i]];
12549b5e25fSSatish Balay     kend    = kstart + a->ilen[irow[i]];
12649b5e25fSSatish Balay     lens[i] = 0;
12749b5e25fSSatish Balay     for (k=kstart; k<kend; k++) {
12826fbe8dcSKarl Rupp       if (ssmap[aj[k]]) lens[i]++;
12949b5e25fSSatish Balay     }
13049b5e25fSSatish Balay   }
13149b5e25fSSatish Balay   /* Create and fill new matrix */
13249b5e25fSSatish Balay   if (scall == MAT_REUSE_MATRIX) {
13349b5e25fSSatish Balay     c = (Mat_SeqSBAIJ*)((*B)->data);
13449b5e25fSSatish Balay 
135e32f2f54SBarry Smith     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
13613f74950SBarry Smith     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
137e7e72b3dSBarry Smith     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
13813f74950SBarry Smith     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
13949b5e25fSSatish Balay     C    = *B;
14049b5e25fSSatish Balay   } else {
141ce94432eSBarry Smith     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
142f69a0ea3SMatthew Knepley     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1437adad957SLisandro Dalcin     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
144ab93d7beSBarry Smith     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
14549b5e25fSSatish Balay   }
14649b5e25fSSatish Balay   c = (Mat_SeqSBAIJ*)(C->data);
14749b5e25fSSatish Balay   for (i=0; i<nrows; i++) {
14849b5e25fSSatish Balay     row      = irow[i];
14949b5e25fSSatish Balay     kstart   = ai[row];
15049b5e25fSSatish Balay     kend     = kstart + a->ilen[row];
15149b5e25fSSatish Balay     mat_i    = c->i[i];
15249b5e25fSSatish Balay     mat_j    = c->j + mat_i;
15349b5e25fSSatish Balay     mat_a    = c->a + mat_i*bs2;
15449b5e25fSSatish Balay     mat_ilen = c->ilen + i;
15549b5e25fSSatish Balay     for (k=kstart; k<kend; k++) {
15649b5e25fSSatish Balay       if ((tcol=ssmap[a->j[k]])) {
15749b5e25fSSatish Balay         *mat_j++ = tcol - 1;
15849b5e25fSSatish Balay         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
15949b5e25fSSatish Balay         mat_a   += bs2;
16049b5e25fSSatish Balay         (*mat_ilen)++;
16149b5e25fSSatish Balay       }
16249b5e25fSSatish Balay     }
16349b5e25fSSatish Balay   }
16449b5e25fSSatish Balay 
16549b5e25fSSatish Balay   /* Free work space */
16649b5e25fSSatish Balay   ierr = PetscFree(smap);CHKERRQ(ierr);
16749b5e25fSSatish Balay   ierr = PetscFree(lens);CHKERRQ(ierr);
16849b5e25fSSatish Balay   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
16949b5e25fSSatish Balay   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
17049b5e25fSSatish Balay 
17149b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
17249b5e25fSSatish Balay   *B   = C;
17349b5e25fSSatish Balay   PetscFunctionReturn(0);
17449b5e25fSSatish Balay }
17549b5e25fSSatish Balay 
1764a2ae208SSatish Balay #undef __FUNCT__
1774a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
1784aa3045dSJed Brown PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
17949b5e25fSSatish Balay {
18049b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
18149b5e25fSSatish Balay   IS             is1;
1826849ba73SBarry Smith   PetscErrorCode ierr;
1835d0c19d7SBarry Smith   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
1845d0c19d7SBarry Smith   const PetscInt *irow;
18549b5e25fSSatish Balay 
18649b5e25fSSatish Balay   PetscFunctionBegin;
187e32f2f54SBarry Smith   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
18849b5e25fSSatish Balay 
18949b5e25fSSatish Balay   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
19049b5e25fSSatish Balay   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
19149b5e25fSSatish Balay 
19249b5e25fSSatish Balay   /* Verify if the indices corespond to each element in a block
19349b5e25fSSatish Balay    and form the IS with compressed IS */
194dcca6d9dSJed Brown   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
19574ed9c26SBarry Smith   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
19649b5e25fSSatish Balay   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
19749b5e25fSSatish Balay 
19849b5e25fSSatish Balay   count = 0;
19949b5e25fSSatish Balay   for (i=0; i<a->mbs; i++) {
200e32f2f54SBarry Smith     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
20149b5e25fSSatish Balay     if (vary[i]==bs) iary[count++] = i;
20249b5e25fSSatish Balay   }
20349b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
20470b3c8c7SBarry Smith   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
20574ed9c26SBarry Smith   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
20649b5e25fSSatish Balay 
2074aa3045dSJed Brown   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr);
2086bf464f9SBarry Smith   ierr = ISDestroy(&is1);CHKERRQ(ierr);
20949b5e25fSSatish Balay   PetscFunctionReturn(0);
21049b5e25fSSatish Balay }
21149b5e25fSSatish Balay 
2124a2ae208SSatish Balay #undef __FUNCT__
2134a2ae208SSatish Balay #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
21413f74950SBarry Smith PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
21549b5e25fSSatish Balay {
2166849ba73SBarry Smith   PetscErrorCode ierr;
21713f74950SBarry Smith   PetscInt       i;
21849b5e25fSSatish Balay 
21949b5e25fSSatish Balay   PetscFunctionBegin;
22049b5e25fSSatish Balay   if (scall == MAT_INITIAL_MATRIX) {
221854ce69bSBarry Smith     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
22249b5e25fSSatish Balay   }
22349b5e25fSSatish Balay 
22449b5e25fSSatish Balay   for (i=0; i<n; i++) {
2254aa3045dSJed Brown     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
22649b5e25fSSatish Balay   }
22749b5e25fSSatish Balay   PetscFunctionReturn(0);
22849b5e25fSSatish Balay }
22949b5e25fSSatish Balay 
23049b5e25fSSatish Balay /* -------------------------------------------------------*/
23149b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
23249b5e25fSSatish Balay /* -------------------------------------------------------*/
23349b5e25fSSatish Balay 
2344a2ae208SSatish Balay #undef __FUNCT__
2354a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_2"
236dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
23749b5e25fSSatish Balay {
23849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
239*d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,zero=0.0;
240*d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
241*d9ca1df4SBarry Smith   const MatScalar   *v;
2426849ba73SBarry Smith   PetscErrorCode    ierr;
243*d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
244*d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
24598c9bda7SSatish Balay   PetscInt          nonzerorow=0;
24649b5e25fSSatish Balay 
24749b5e25fSSatish Balay   PetscFunctionBegin;
2482dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
249*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2501ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
25149b5e25fSSatish Balay 
25249b5e25fSSatish Balay   v  = a->a;
25349b5e25fSSatish Balay   xb = x;
25449b5e25fSSatish Balay 
25549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
25649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
25749b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
25849b5e25fSSatish Balay     ib          = aj + *ai;
259831a3094SHong Zhang     jmin        = 0;
26098c9bda7SSatish Balay     nonzerorow += (n>0);
2617fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
26249b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
26349b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
264831a3094SHong Zhang       v        += 4; jmin++;
2657fbae186SHong Zhang     }
266444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
267444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
268831a3094SHong Zhang     for (j=jmin; j<n; j++) {
26949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
27049b5e25fSSatish Balay       cval       = ib[j]*2;
27149b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
27249b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
27349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
27449b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
27549b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
27649b5e25fSSatish Balay       v        += 4;
27749b5e25fSSatish Balay     }
27849b5e25fSSatish Balay     xb +=2; ai++;
27949b5e25fSSatish Balay   }
28049b5e25fSSatish Balay 
281*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2821ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
283dc0b31edSSatish Balay   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
28449b5e25fSSatish Balay   PetscFunctionReturn(0);
28549b5e25fSSatish Balay }
28649b5e25fSSatish Balay 
2874a2ae208SSatish Balay #undef __FUNCT__
2884a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_3"
289dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
29049b5e25fSSatish Balay {
29149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
292*d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,zero=0.0;
293*d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
294*d9ca1df4SBarry Smith   const MatScalar   *v;
2956849ba73SBarry Smith   PetscErrorCode    ierr;
296*d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
297*d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
29898c9bda7SSatish Balay   PetscInt          nonzerorow=0;
29949b5e25fSSatish Balay 
30049b5e25fSSatish Balay   PetscFunctionBegin;
3012dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
302*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3031ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
30449b5e25fSSatish Balay 
30549b5e25fSSatish Balay   v  = a->a;
30649b5e25fSSatish Balay   xb = x;
30749b5e25fSSatish Balay 
30849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
30949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
31049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
31149b5e25fSSatish Balay     ib          = aj + *ai;
312831a3094SHong Zhang     jmin        = 0;
31398c9bda7SSatish Balay     nonzerorow += (n>0);
3147fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
31549b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
31649b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
31749b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
318831a3094SHong Zhang       v        += 9; jmin++;
3197fbae186SHong Zhang     }
320444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
321444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
322831a3094SHong Zhang     for (j=jmin; j<n; j++) {
32349b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
32449b5e25fSSatish Balay       cval       = ib[j]*3;
32549b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
32649b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
32749b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
32849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
32949b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
33049b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
33149b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
33249b5e25fSSatish Balay       v        += 9;
33349b5e25fSSatish Balay     }
33449b5e25fSSatish Balay     xb +=3; ai++;
33549b5e25fSSatish Balay   }
33649b5e25fSSatish Balay 
337*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3381ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
339dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
34049b5e25fSSatish Balay   PetscFunctionReturn(0);
34149b5e25fSSatish Balay }
34249b5e25fSSatish Balay 
3434a2ae208SSatish Balay #undef __FUNCT__
3444a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_4"
345dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
34649b5e25fSSatish Balay {
34749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
348*d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
349*d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
350*d9ca1df4SBarry Smith   const MatScalar   *v;
3516849ba73SBarry Smith   PetscErrorCode    ierr;
352*d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
353*d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
35498c9bda7SSatish Balay   PetscInt          nonzerorow = 0;
35549b5e25fSSatish Balay 
35649b5e25fSSatish Balay   PetscFunctionBegin;
3572dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
358*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
3591ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
36049b5e25fSSatish Balay 
36149b5e25fSSatish Balay   v  = a->a;
36249b5e25fSSatish Balay   xb = x;
36349b5e25fSSatish Balay 
36449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
36549b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
36649b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
36749b5e25fSSatish Balay     ib          = aj + *ai;
368831a3094SHong Zhang     jmin        = 0;
36998c9bda7SSatish Balay     nonzerorow += (n>0);
3707fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
37149b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
37249b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
37349b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
37449b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
375831a3094SHong Zhang       v        += 16; jmin++;
3767fbae186SHong Zhang     }
377444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
378444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
379831a3094SHong Zhang     for (j=jmin; j<n; j++) {
38049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
38149b5e25fSSatish Balay       cval       = ib[j]*4;
38249b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
38349b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
38449b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
38549b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
38649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
38749b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
38849b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
38949b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
39049b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
39149b5e25fSSatish Balay       v        += 16;
39249b5e25fSSatish Balay     }
39349b5e25fSSatish Balay     xb +=4; ai++;
39449b5e25fSSatish Balay   }
39549b5e25fSSatish Balay 
396*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
3971ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
398dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
39949b5e25fSSatish Balay   PetscFunctionReturn(0);
40049b5e25fSSatish Balay }
40149b5e25fSSatish Balay 
4024a2ae208SSatish Balay #undef __FUNCT__
4034a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_5"
404dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
40549b5e25fSSatish Balay {
40649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
407*d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
408*d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
409*d9ca1df4SBarry Smith   const MatScalar   *v;
4106849ba73SBarry Smith   PetscErrorCode    ierr;
411*d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
412*d9ca1df4SBarry Smith   const PetscInt    *aj = a->j,*ai = a->i,*ib;
41398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
41449b5e25fSSatish Balay 
41549b5e25fSSatish Balay   PetscFunctionBegin;
4162dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
417*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4181ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
41949b5e25fSSatish Balay 
42049b5e25fSSatish Balay   v  = a->a;
42149b5e25fSSatish Balay   xb = x;
42249b5e25fSSatish Balay 
42349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
42449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
42549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
42649b5e25fSSatish Balay     ib          = aj + *ai;
427831a3094SHong Zhang     jmin        = 0;
42898c9bda7SSatish Balay     nonzerorow += (n>0);
4297fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
43049b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
43149b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
43249b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
43349b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
43449b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
435831a3094SHong Zhang       v        += 25; jmin++;
4367fbae186SHong Zhang     }
437444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
438444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
439831a3094SHong Zhang     for (j=jmin; j<n; j++) {
44049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
44149b5e25fSSatish Balay       cval       = ib[j]*5;
44249b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
44349b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
44449b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
44549b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
44649b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
44749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
44849b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
44949b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
45049b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
45149b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
45249b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
45349b5e25fSSatish Balay       v        += 25;
45449b5e25fSSatish Balay     }
45549b5e25fSSatish Balay     xb +=5; ai++;
45649b5e25fSSatish Balay   }
45749b5e25fSSatish Balay 
458*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
4591ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
460dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
46149b5e25fSSatish Balay   PetscFunctionReturn(0);
46249b5e25fSSatish Balay }
46349b5e25fSSatish Balay 
46449b5e25fSSatish Balay 
4654a2ae208SSatish Balay #undef __FUNCT__
4664a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_6"
467dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
46849b5e25fSSatish Balay {
46949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
470*d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
471*d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
472*d9ca1df4SBarry Smith   const MatScalar   *v;
4736849ba73SBarry Smith   PetscErrorCode    ierr;
474*d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
475*d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
47698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
47749b5e25fSSatish Balay 
47849b5e25fSSatish Balay   PetscFunctionBegin;
4792dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
480*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
4811ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
48249b5e25fSSatish Balay 
48349b5e25fSSatish Balay   v  = a->a;
48449b5e25fSSatish Balay   xb = x;
48549b5e25fSSatish Balay 
48649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
48749b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
48849b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
48949b5e25fSSatish Balay     ib          = aj + *ai;
490831a3094SHong Zhang     jmin        = 0;
49198c9bda7SSatish Balay     nonzerorow += (n>0);
4927fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
49349b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
49449b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
49549b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
49649b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
49749b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
49849b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
499831a3094SHong Zhang       v        += 36; jmin++;
5007fbae186SHong Zhang     }
501444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
502444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
503831a3094SHong Zhang     for (j=jmin; j<n; j++) {
50449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
50549b5e25fSSatish Balay       cval       = ib[j]*6;
50649b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
50749b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
50849b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
50949b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
51049b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
51149b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
51249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
51349b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
51449b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
51549b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
51649b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
51749b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
51849b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
51949b5e25fSSatish Balay       v        += 36;
52049b5e25fSSatish Balay     }
52149b5e25fSSatish Balay     xb +=6; ai++;
52249b5e25fSSatish Balay   }
52349b5e25fSSatish Balay 
524*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5251ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
526dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
52749b5e25fSSatish Balay   PetscFunctionReturn(0);
52849b5e25fSSatish Balay }
5294a2ae208SSatish Balay #undef __FUNCT__
5304a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_7"
531dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
53249b5e25fSSatish Balay {
53349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
534*d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
535*d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
536*d9ca1df4SBarry Smith   const MatScalar   *v;
5376849ba73SBarry Smith   PetscErrorCode    ierr;
538*d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
539*d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
54098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
54149b5e25fSSatish Balay 
54249b5e25fSSatish Balay   PetscFunctionBegin;
5432dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
544*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
5451ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
54649b5e25fSSatish Balay 
54749b5e25fSSatish Balay   v  = a->a;
54849b5e25fSSatish Balay   xb = x;
54949b5e25fSSatish Balay 
55049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
55149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
55249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
55349b5e25fSSatish Balay     ib          = aj + *ai;
554831a3094SHong Zhang     jmin        = 0;
55598c9bda7SSatish Balay     nonzerorow += (n>0);
5567fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
55749b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
55849b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
55949b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
56049b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
56149b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
56249b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
56349b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
564831a3094SHong Zhang       v        += 49; jmin++;
5657fbae186SHong Zhang     }
566444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
567444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
568831a3094SHong Zhang     for (j=jmin; j<n; j++) {
56949b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
57049b5e25fSSatish Balay       cval       = ib[j]*7;
57149b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
57249b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
57349b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
57449b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
57549b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
57649b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
57749b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
57849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
57949b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
58049b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
58149b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
58249b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
58349b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
58449b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
58549b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
58649b5e25fSSatish Balay       v       += 49;
58749b5e25fSSatish Balay     }
58849b5e25fSSatish Balay     xb +=7; ai++;
58949b5e25fSSatish Balay   }
590*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
5911ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
592dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
59349b5e25fSSatish Balay   PetscFunctionReturn(0);
59449b5e25fSSatish Balay }
59549b5e25fSSatish Balay 
59649b5e25fSSatish Balay /*
59749b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
59849b5e25fSSatish Balay */
5994a2ae208SSatish Balay #undef __FUNCT__
6004a2ae208SSatish Balay #define __FUNCT__ "MatMult_SeqSBAIJ_N"
601dfbe8321SBarry Smith PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
60249b5e25fSSatish Balay {
60349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
604*d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
605*d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
606*d9ca1df4SBarry Smith   const MatScalar   *v;
607dfbe8321SBarry Smith   PetscErrorCode    ierr;
608*d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
609*d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
61098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
61149b5e25fSSatish Balay 
61249b5e25fSSatish Balay   PetscFunctionBegin;
6132dcb1b2aSMatthew Knepley   ierr = VecSet(zz,zero);CHKERRQ(ierr);
614*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
6151ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
61649b5e25fSSatish Balay 
61749b5e25fSSatish Balay   aj = a->j;
61849b5e25fSSatish Balay   v  = a->a;
61949b5e25fSSatish Balay   ii = a->i;
62049b5e25fSSatish Balay 
62149b5e25fSSatish Balay   if (!a->mult_work) {
622854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
62349b5e25fSSatish Balay   }
62449b5e25fSSatish Balay   work = a->mult_work;
62549b5e25fSSatish Balay 
62649b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
62749b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
62849b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
62998c9bda7SSatish Balay     nonzerorow += (n>0);
63049b5e25fSSatish Balay 
63149b5e25fSSatish Balay     /* upper triangular part */
63249b5e25fSSatish Balay     for (j=0; j<n; j++) {
63349b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
63449b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
63549b5e25fSSatish Balay       workt += bs;
63649b5e25fSSatish Balay     }
63749b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
63896b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
63949b5e25fSSatish Balay 
64049b5e25fSSatish Balay     /* strict lower triangular part */
641831a3094SHong Zhang     idx = aj+ii[0];
642831a3094SHong Zhang     if (*idx == i) {
64396b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
644831a3094SHong Zhang     }
64596b9376eSHong Zhang 
64649b5e25fSSatish Balay     if (ncols > 0) {
64749b5e25fSSatish Balay       workt = work;
64887828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
64996b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
650831a3094SHong Zhang       for (j=0; j<n; j++) {
651831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
65249b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
65349b5e25fSSatish Balay         workt += bs;
65449b5e25fSSatish Balay       }
65549b5e25fSSatish Balay     }
65649b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
65749b5e25fSSatish Balay   }
65849b5e25fSSatish Balay 
659*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
6601ebc52fbSHong Zhang   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
661dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
66249b5e25fSSatish Balay   PetscFunctionReturn(0);
66349b5e25fSSatish Balay }
66449b5e25fSSatish Balay 
6654a2ae208SSatish Balay #undef __FUNCT__
6664a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
667dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
66849b5e25fSSatish Balay {
66949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
670*d9ca1df4SBarry Smith   PetscScalar       *z,x1;
671*d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
672*d9ca1df4SBarry Smith   const MatScalar   *v;
6736849ba73SBarry Smith   PetscErrorCode    ierr;
674*d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
675*d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
67698c9bda7SSatish Balay   PetscInt          nonzerorow=0;
67749b5e25fSSatish Balay 
67849b5e25fSSatish Balay   PetscFunctionBegin;
679343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
680*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
6811ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
68249b5e25fSSatish Balay   v    = a->a;
68349b5e25fSSatish Balay   xb   = x;
68449b5e25fSSatish Balay 
68549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
68649b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th row of A */
68749b5e25fSSatish Balay     x1          = xb[0];
68849b5e25fSSatish Balay     ib          = aj + *ai;
689831a3094SHong Zhang     jmin        = 0;
69098c9bda7SSatish Balay     nonzerorow += (n>0);
691831a3094SHong Zhang     if (*ib == i) {            /* (diag of A)*x */
692831a3094SHong Zhang       z[i] += *v++ * x[*ib++]; jmin++;
693831a3094SHong Zhang     }
694831a3094SHong Zhang     for (j=jmin; j<n; j++) {
69549b5e25fSSatish Balay       cval    = *ib;
69649b5e25fSSatish Balay       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
69749b5e25fSSatish Balay       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
69849b5e25fSSatish Balay     }
69949b5e25fSSatish Balay     xb++; ai++;
70049b5e25fSSatish Balay   }
70149b5e25fSSatish Balay 
702*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
703bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
70449b5e25fSSatish Balay 
705dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
70649b5e25fSSatish Balay   PetscFunctionReturn(0);
70749b5e25fSSatish Balay }
70849b5e25fSSatish Balay 
7094a2ae208SSatish Balay #undef __FUNCT__
7104a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
711dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
71249b5e25fSSatish Balay {
71349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
714*d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2;
715*d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
716*d9ca1df4SBarry Smith   const MatScalar   *v;
7176849ba73SBarry Smith   PetscErrorCode    ierr;
718*d9ca1df4SBarry Smith   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
719*d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
72098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
72149b5e25fSSatish Balay 
72249b5e25fSSatish Balay   PetscFunctionBegin;
723343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
724*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7251ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
72649b5e25fSSatish Balay 
72749b5e25fSSatish Balay   v  = a->a;
72849b5e25fSSatish Balay   xb = x;
72949b5e25fSSatish Balay 
73049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
73149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
73249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1];
73349b5e25fSSatish Balay     ib          = aj + *ai;
734831a3094SHong Zhang     jmin        = 0;
73598c9bda7SSatish Balay     nonzerorow += (n>0);
7367fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
73749b5e25fSSatish Balay       z[2*i]   += v[0]*x1 + v[2]*x2;
73849b5e25fSSatish Balay       z[2*i+1] += v[2]*x1 + v[3]*x2;
739831a3094SHong Zhang       v        += 4; jmin++;
7407fbae186SHong Zhang     }
741444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
742444d8c10SJed Brown     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
743831a3094SHong Zhang     for (j=jmin; j<n; j++) {
74449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
74549b5e25fSSatish Balay       cval       = ib[j]*2;
74649b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2;
74749b5e25fSSatish Balay       z[cval+1] += v[2]*x1 + v[3]*x2;
74849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
74949b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
75049b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
75149b5e25fSSatish Balay       v        += 4;
75249b5e25fSSatish Balay     }
75349b5e25fSSatish Balay     xb +=2; ai++;
75449b5e25fSSatish Balay   }
755*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
756bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
75749b5e25fSSatish Balay 
758dc0b31edSSatish Balay   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
75949b5e25fSSatish Balay   PetscFunctionReturn(0);
76049b5e25fSSatish Balay }
76149b5e25fSSatish Balay 
7624a2ae208SSatish Balay #undef __FUNCT__
7634a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
764dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
76549b5e25fSSatish Balay {
76649b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
767*d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3;
768*d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
769*d9ca1df4SBarry Smith   const MatScalar   *v;
7706849ba73SBarry Smith   PetscErrorCode    ierr;
771*d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
772*d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
77398c9bda7SSatish Balay   PetscInt          nonzerorow=0;
77449b5e25fSSatish Balay 
77549b5e25fSSatish Balay   PetscFunctionBegin;
776343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
777*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
7781ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
77949b5e25fSSatish Balay 
78049b5e25fSSatish Balay   v  = a->a;
78149b5e25fSSatish Balay   xb = x;
78249b5e25fSSatish Balay 
78349b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
78449b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
78549b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
78649b5e25fSSatish Balay     ib          = aj + *ai;
787831a3094SHong Zhang     jmin        = 0;
78898c9bda7SSatish Balay     nonzerorow += (n>0);
7897fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
79049b5e25fSSatish Balay       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
79149b5e25fSSatish Balay       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
79249b5e25fSSatish Balay       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
793831a3094SHong Zhang       v        += 9; jmin++;
7947fbae186SHong Zhang     }
795444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
796444d8c10SJed Brown     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
797831a3094SHong Zhang     for (j=jmin; j<n; j++) {
79849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
79949b5e25fSSatish Balay       cval       = ib[j]*3;
80049b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
80149b5e25fSSatish Balay       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
80249b5e25fSSatish Balay       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
80349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
80449b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
80549b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
80649b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
80749b5e25fSSatish Balay       v        += 9;
80849b5e25fSSatish Balay     }
80949b5e25fSSatish Balay     xb +=3; ai++;
81049b5e25fSSatish Balay   }
81149b5e25fSSatish Balay 
812*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
813bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
81449b5e25fSSatish Balay 
815dc0b31edSSatish Balay   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
81649b5e25fSSatish Balay   PetscFunctionReturn(0);
81749b5e25fSSatish Balay }
81849b5e25fSSatish Balay 
8194a2ae208SSatish Balay #undef __FUNCT__
8204a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
821dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
82249b5e25fSSatish Balay {
82349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
824*d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4;
825*d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
826*d9ca1df4SBarry Smith   const MatScalar   *v;
8276849ba73SBarry Smith   PetscErrorCode    ierr;
828*d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
829*d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
83098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
83149b5e25fSSatish Balay 
83249b5e25fSSatish Balay   PetscFunctionBegin;
833343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
834*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8351ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
83649b5e25fSSatish Balay 
83749b5e25fSSatish Balay   v  = a->a;
83849b5e25fSSatish Balay   xb = x;
83949b5e25fSSatish Balay 
84049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
84149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
84249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
84349b5e25fSSatish Balay     ib          = aj + *ai;
844831a3094SHong Zhang     jmin        = 0;
84598c9bda7SSatish Balay     nonzerorow += (n>0);
8467fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
84749b5e25fSSatish Balay       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
84849b5e25fSSatish Balay       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
84949b5e25fSSatish Balay       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
85049b5e25fSSatish Balay       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
851831a3094SHong Zhang       v        += 16; jmin++;
8527fbae186SHong Zhang     }
853444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
854444d8c10SJed Brown     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
855831a3094SHong Zhang     for (j=jmin; j<n; j++) {
85649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
85749b5e25fSSatish Balay       cval       = ib[j]*4;
85849b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
85949b5e25fSSatish Balay       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
86049b5e25fSSatish Balay       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
86149b5e25fSSatish Balay       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
86249b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
86349b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
86449b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
86549b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
86649b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
86749b5e25fSSatish Balay       v        += 16;
86849b5e25fSSatish Balay     }
86949b5e25fSSatish Balay     xb +=4; ai++;
87049b5e25fSSatish Balay   }
87149b5e25fSSatish Balay 
872*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
873bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
87449b5e25fSSatish Balay 
875dc0b31edSSatish Balay   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
87649b5e25fSSatish Balay   PetscFunctionReturn(0);
87749b5e25fSSatish Balay }
87849b5e25fSSatish Balay 
8794a2ae208SSatish Balay #undef __FUNCT__
8804a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
881dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
88249b5e25fSSatish Balay {
88349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
884*d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5;
885*d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
886*d9ca1df4SBarry Smith   const MatScalar   *v;
8876849ba73SBarry Smith   PetscErrorCode    ierr;
888*d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
889*d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
89098c9bda7SSatish Balay   PetscInt          nonzerorow=0;
89149b5e25fSSatish Balay 
89249b5e25fSSatish Balay   PetscFunctionBegin;
893343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
894*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
8951ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
89649b5e25fSSatish Balay 
89749b5e25fSSatish Balay   v  = a->a;
89849b5e25fSSatish Balay   xb = x;
89949b5e25fSSatish Balay 
90049b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
90149b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
90249b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
90349b5e25fSSatish Balay     ib          = aj + *ai;
904831a3094SHong Zhang     jmin        = 0;
90598c9bda7SSatish Balay     nonzerorow += (n>0);
9067fbae186SHong Zhang     if (*ib == i) {      /* (diag of A)*x */
90749b5e25fSSatish Balay       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
90849b5e25fSSatish Balay       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
90949b5e25fSSatish Balay       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
91049b5e25fSSatish Balay       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
91149b5e25fSSatish Balay       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
912831a3094SHong Zhang       v        += 25; jmin++;
9137fbae186SHong Zhang     }
914444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
915444d8c10SJed Brown     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
916831a3094SHong Zhang     for (j=jmin; j<n; j++) {
91749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
91849b5e25fSSatish Balay       cval       = ib[j]*5;
91949b5e25fSSatish Balay       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
92049b5e25fSSatish Balay       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
92149b5e25fSSatish Balay       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
92249b5e25fSSatish Balay       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
92349b5e25fSSatish Balay       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
92449b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
92549b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
92649b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
92749b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
92849b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
92949b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
93049b5e25fSSatish Balay       v        += 25;
93149b5e25fSSatish Balay     }
93249b5e25fSSatish Balay     xb +=5; ai++;
93349b5e25fSSatish Balay   }
93449b5e25fSSatish Balay 
935*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
936bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
93749b5e25fSSatish Balay 
938dc0b31edSSatish Balay   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
93949b5e25fSSatish Balay   PetscFunctionReturn(0);
94049b5e25fSSatish Balay }
9414a2ae208SSatish Balay #undef __FUNCT__
9424a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
943dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
94449b5e25fSSatish Balay {
94549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
946*d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
947*d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
948*d9ca1df4SBarry Smith   const MatScalar   *v;
9496849ba73SBarry Smith   PetscErrorCode    ierr;
950*d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
951*d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
95298c9bda7SSatish Balay   PetscInt          nonzerorow=0;
95349b5e25fSSatish Balay 
95449b5e25fSSatish Balay   PetscFunctionBegin;
955343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
956*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
9571ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
95849b5e25fSSatish Balay 
95949b5e25fSSatish Balay   v  = a->a;
96049b5e25fSSatish Balay   xb = x;
96149b5e25fSSatish Balay 
96249b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
96349b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
96449b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
96549b5e25fSSatish Balay     ib          = aj + *ai;
966831a3094SHong Zhang     jmin        = 0;
96798c9bda7SSatish Balay     nonzerorow += (n>0);
9687fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
96949b5e25fSSatish Balay       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
97049b5e25fSSatish Balay       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
97149b5e25fSSatish Balay       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
97249b5e25fSSatish Balay       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
97349b5e25fSSatish Balay       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
97449b5e25fSSatish Balay       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
975831a3094SHong Zhang       v        += 36; jmin++;
9767fbae186SHong Zhang     }
977444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
978444d8c10SJed Brown     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
979831a3094SHong Zhang     for (j=jmin; j<n; j++) {
98049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
98149b5e25fSSatish Balay       cval       = ib[j]*6;
98249b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
98349b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
98449b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
98549b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
98649b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
98749b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
98849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
98949b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
99049b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
99149b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
99249b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
99349b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
99449b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
99549b5e25fSSatish Balay       v        += 36;
99649b5e25fSSatish Balay     }
99749b5e25fSSatish Balay     xb +=6; ai++;
99849b5e25fSSatish Balay   }
99949b5e25fSSatish Balay 
1000*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1001bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
100249b5e25fSSatish Balay 
1003dc0b31edSSatish Balay   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
100449b5e25fSSatish Balay   PetscFunctionReturn(0);
100549b5e25fSSatish Balay }
100649b5e25fSSatish Balay 
10074a2ae208SSatish Balay #undef __FUNCT__
10084a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1009dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
101049b5e25fSSatish Balay {
101149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1012*d9ca1df4SBarry Smith   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1013*d9ca1df4SBarry Smith   const PetscScalar *x,*xb;
1014*d9ca1df4SBarry Smith   const MatScalar   *v;
10156849ba73SBarry Smith   PetscErrorCode    ierr;
1016*d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1017*d9ca1df4SBarry Smith   const PetscInt    *aj=a->j,*ai=a->i,*ib;
101898c9bda7SSatish Balay   PetscInt          nonzerorow=0;
101949b5e25fSSatish Balay 
102049b5e25fSSatish Balay   PetscFunctionBegin;
1021343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1022*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
10231ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
102449b5e25fSSatish Balay 
102549b5e25fSSatish Balay   v  = a->a;
102649b5e25fSSatish Balay   xb = x;
102749b5e25fSSatish Balay 
102849b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
102949b5e25fSSatish Balay     n           = ai[1] - ai[0]; /* length of i_th block row of A */
103049b5e25fSSatish Balay     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
103149b5e25fSSatish Balay     ib          = aj + *ai;
1032831a3094SHong Zhang     jmin        = 0;
103398c9bda7SSatish Balay     nonzerorow += (n>0);
10347fbae186SHong Zhang     if (*ib == i) {     /* (diag of A)*x */
103549b5e25fSSatish Balay       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
103649b5e25fSSatish Balay       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
103749b5e25fSSatish Balay       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
103849b5e25fSSatish Balay       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
103949b5e25fSSatish Balay       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
104049b5e25fSSatish Balay       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
104149b5e25fSSatish Balay       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1042831a3094SHong Zhang       v        += 49; jmin++;
10437fbae186SHong Zhang     }
1044444d8c10SJed Brown     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1045444d8c10SJed Brown     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1046831a3094SHong Zhang     for (j=jmin; j<n; j++) {
104749b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
104849b5e25fSSatish Balay       cval       = ib[j]*7;
104949b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
105049b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
105149b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
105249b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
105349b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
105449b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
105549b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
105649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
105749b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
105849b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
105949b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
106049b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
106149b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
106249b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
106349b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
106449b5e25fSSatish Balay       v       += 49;
106549b5e25fSSatish Balay     }
106649b5e25fSSatish Balay     xb +=7; ai++;
106749b5e25fSSatish Balay   }
106849b5e25fSSatish Balay 
1069*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1070bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
107149b5e25fSSatish Balay 
1072dc0b31edSSatish Balay   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
107349b5e25fSSatish Balay   PetscFunctionReturn(0);
107449b5e25fSSatish Balay }
107549b5e25fSSatish Balay 
10764a2ae208SSatish Balay #undef __FUNCT__
10774a2ae208SSatish Balay #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1078dfbe8321SBarry Smith PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
107949b5e25fSSatish Balay {
108049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1081*d9ca1df4SBarry Smith   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1082*d9ca1df4SBarry Smith   const PetscScalar *x,*x_ptr,*xb;
1083*d9ca1df4SBarry Smith   const MatScalar   *v;
1084dfbe8321SBarry Smith   PetscErrorCode    ierr;
1085*d9ca1df4SBarry Smith   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1086*d9ca1df4SBarry Smith   const PetscInt    *idx,*aj,*ii;
108798c9bda7SSatish Balay   PetscInt          nonzerorow=0;
108849b5e25fSSatish Balay 
108949b5e25fSSatish Balay   PetscFunctionBegin;
1090343551c4SBarry Smith   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1091*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
10921ebc52fbSHong Zhang   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
109349b5e25fSSatish Balay 
109449b5e25fSSatish Balay   aj = a->j;
109549b5e25fSSatish Balay   v  = a->a;
109649b5e25fSSatish Balay   ii = a->i;
109749b5e25fSSatish Balay 
109849b5e25fSSatish Balay   if (!a->mult_work) {
1099854ce69bSBarry Smith     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
110049b5e25fSSatish Balay   }
110149b5e25fSSatish Balay   work = a->mult_work;
110249b5e25fSSatish Balay 
110349b5e25fSSatish Balay 
110449b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
110549b5e25fSSatish Balay     n           = ii[1] - ii[0]; ncols = n*bs;
110649b5e25fSSatish Balay     workt       = work; idx=aj+ii[0];
110798c9bda7SSatish Balay     nonzerorow += (n>0);
110849b5e25fSSatish Balay 
110949b5e25fSSatish Balay     /* upper triangular part */
111049b5e25fSSatish Balay     for (j=0; j<n; j++) {
111149b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
111249b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
111349b5e25fSSatish Balay       workt += bs;
111449b5e25fSSatish Balay     }
111549b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
111696b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
111749b5e25fSSatish Balay 
111849b5e25fSSatish Balay     /* strict lower triangular part */
1119831a3094SHong Zhang     idx = aj+ii[0];
1120831a3094SHong Zhang     if (*idx == i) {
112196b9376eSHong Zhang       ncols -= bs; v += bs2; idx++; n--;
1122831a3094SHong Zhang     }
112349b5e25fSSatish Balay     if (ncols > 0) {
112449b5e25fSSatish Balay       workt = work;
112587828ca2SBarry Smith       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
112696b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1127831a3094SHong Zhang       for (j=0; j<n; j++) {
1128831a3094SHong Zhang         zb = z_ptr + bs*(*idx++);
112949b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k];
113049b5e25fSSatish Balay         workt += bs;
113149b5e25fSSatish Balay       }
113249b5e25fSSatish Balay     }
113349b5e25fSSatish Balay 
113449b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
113549b5e25fSSatish Balay   }
113649b5e25fSSatish Balay 
1137*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1138bba805e6SBarry Smith   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
113949b5e25fSSatish Balay 
1140dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
114149b5e25fSSatish Balay   PetscFunctionReturn(0);
114249b5e25fSSatish Balay }
114349b5e25fSSatish Balay 
11444a2ae208SSatish Balay #undef __FUNCT__
11454a2ae208SSatish Balay #define __FUNCT__ "MatScale_SeqSBAIJ"
1146f4df32b1SMatthew Knepley PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
114749b5e25fSSatish Balay {
114849b5e25fSSatish Balay   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1149f4df32b1SMatthew Knepley   PetscScalar    oalpha = alpha;
1150efee365bSSatish Balay   PetscErrorCode ierr;
1151c5df96a5SBarry Smith   PetscBLASInt   one = 1,totalnz;
115249b5e25fSSatish Balay 
115349b5e25fSSatish Balay   PetscFunctionBegin;
1154c5df96a5SBarry Smith   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
11558b83055fSJed Brown   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1156efee365bSSatish Balay   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
115749b5e25fSSatish Balay   PetscFunctionReturn(0);
115849b5e25fSSatish Balay }
115949b5e25fSSatish Balay 
11604a2ae208SSatish Balay #undef __FUNCT__
11614a2ae208SSatish Balay #define __FUNCT__ "MatNorm_SeqSBAIJ"
1162dfbe8321SBarry Smith PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
116349b5e25fSSatish Balay {
116449b5e25fSSatish Balay   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1165*d9ca1df4SBarry Smith   const MatScalar *v       = a->a;
116649b5e25fSSatish Balay   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1167*d9ca1df4SBarry Smith   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
11686849ba73SBarry Smith   PetscErrorCode  ierr;
1169*d9ca1df4SBarry Smith   const PetscInt  *aj=a->j,*col;
117049b5e25fSSatish Balay 
117149b5e25fSSatish Balay   PetscFunctionBegin;
117249b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
117349b5e25fSSatish Balay     for (k=0; k<mbs; k++) {
117449b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
1175831a3094SHong Zhang       col  = aj + jmin;
1176831a3094SHong Zhang       if (*col == k) {         /* diagonal block */
117749b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
117849b5e25fSSatish Balay           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
117949b5e25fSSatish Balay         }
1180831a3094SHong Zhang         jmin++;
1181831a3094SHong Zhang       }
1182831a3094SHong Zhang       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
118349b5e25fSSatish Balay         for (i=0; i<bs2; i++) {
118449b5e25fSSatish Balay           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
118549b5e25fSSatish Balay         }
118649b5e25fSSatish Balay       }
118749b5e25fSSatish Balay     }
11888f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
11890b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1190dcca6d9dSJed Brown     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
11910b8dc8d2SHong Zhang     for (i=0; i<mbs; i++) jl[i] = mbs;
11920b8dc8d2SHong Zhang     il[0] = 0;
119349b5e25fSSatish Balay 
119449b5e25fSSatish Balay     *norm = 0.0;
119549b5e25fSSatish Balay     for (k=0; k<mbs; k++) { /* k_th block row */
119649b5e25fSSatish Balay       for (j=0; j<bs; j++) sum[j]=0.0;
119749b5e25fSSatish Balay       /*-- col sum --*/
119849b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1199831a3094SHong Zhang       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1200831a3094SHong Zhang                   at step k */
120149b5e25fSSatish Balay       while (i<mbs) {
120249b5e25fSSatish Balay         nexti = jl[i];  /* next block row to be added */
120349b5e25fSSatish Balay         ik    = il[i];  /* block index of A(i,k) in the array a */
120449b5e25fSSatish Balay         for (j=0; j<bs; j++) {
120549b5e25fSSatish Balay           v = a->a + ik*bs2 + j*bs;
120649b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
120749b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v); v++;
120849b5e25fSSatish Balay           }
120949b5e25fSSatish Balay         }
121049b5e25fSSatish Balay         /* update il, jl */
1211831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1212831a3094SHong Zhang         jmax = a->i[i+1];
121349b5e25fSSatish Balay         if (jmin < jmax) {
121449b5e25fSSatish Balay           il[i] = jmin;
121549b5e25fSSatish Balay           j     = a->j[jmin];
121649b5e25fSSatish Balay           jl[i] = jl[j]; jl[j]=i;
121749b5e25fSSatish Balay         }
121849b5e25fSSatish Balay         i = nexti;
121949b5e25fSSatish Balay       }
122049b5e25fSSatish Balay       /*-- row sum --*/
122149b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
122249b5e25fSSatish Balay       for (i=jmin; i<jmax; i++) {
122349b5e25fSSatish Balay         for (j=0; j<bs; j++) {
122449b5e25fSSatish Balay           v = a->a + i*bs2 + j;
122549b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
12260b8dc8d2SHong Zhang             sum[j] += PetscAbsScalar(*v); v += bs;
122749b5e25fSSatish Balay           }
122849b5e25fSSatish Balay         }
122949b5e25fSSatish Balay       }
123049b5e25fSSatish Balay       /* add k_th block row to il, jl */
1231831a3094SHong Zhang       col = aj+jmin;
1232831a3094SHong Zhang       if (*col == k) jmin++;
123349b5e25fSSatish Balay       if (jmin < jmax) {
123449b5e25fSSatish Balay         il[k] = jmin;
12350b8dc8d2SHong Zhang         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
123649b5e25fSSatish Balay       }
123749b5e25fSSatish Balay       for (j=0; j<bs; j++) {
123849b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
123949b5e25fSSatish Balay       }
124049b5e25fSSatish Balay     }
124174ed9c26SBarry Smith     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1242f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
124349b5e25fSSatish Balay   PetscFunctionReturn(0);
124449b5e25fSSatish Balay }
124549b5e25fSSatish Balay 
12464a2ae208SSatish Balay #undef __FUNCT__
12474a2ae208SSatish Balay #define __FUNCT__ "MatEqual_SeqSBAIJ"
1248ace3abfcSBarry Smith PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
124949b5e25fSSatish Balay {
125049b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1251dfbe8321SBarry Smith   PetscErrorCode ierr;
125249b5e25fSSatish Balay 
125349b5e25fSSatish Balay   PetscFunctionBegin;
125449b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1255d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1256ef511fbeSHong Zhang     *flg = PETSC_FALSE;
1257ef511fbeSHong Zhang     PetscFunctionReturn(0);
125849b5e25fSSatish Balay   }
125949b5e25fSSatish Balay 
126049b5e25fSSatish Balay   /* if the a->i are the same */
126113f74950SBarry Smith   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
126226fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
126349b5e25fSSatish Balay 
126449b5e25fSSatish Balay   /* if a->j are the same */
126513f74950SBarry Smith   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
126626fbe8dcSKarl Rupp   if (!*flg) PetscFunctionReturn(0);
126726fbe8dcSKarl Rupp 
126849b5e25fSSatish Balay   /* if a->a are the same */
1269d0f46423SBarry Smith   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1270935af2e7SHong Zhang   PetscFunctionReturn(0);
127149b5e25fSSatish Balay }
127249b5e25fSSatish Balay 
12734a2ae208SSatish Balay #undef __FUNCT__
12744a2ae208SSatish Balay #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1275dfbe8321SBarry Smith PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
127649b5e25fSSatish Balay {
127749b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1278dfbe8321SBarry Smith   PetscErrorCode  ierr;
1279*d9ca1df4SBarry Smith   PetscInt        i,j,k,row,bs,ambs,bs2;
1280*d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
128187828ca2SBarry Smith   PetscScalar     *x,zero = 0.0;
1282*d9ca1df4SBarry Smith   const MatScalar *aa,*aa_j;
128349b5e25fSSatish Balay 
128449b5e25fSSatish Balay   PetscFunctionBegin;
1285d0f46423SBarry Smith   bs = A->rmap->bs;
1286e32f2f54SBarry Smith   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
128782799104SHong Zhang 
128849b5e25fSSatish Balay   aa   = a->a;
12898a0c6efdSHong Zhang   ambs = a->mbs;
12908a0c6efdSHong Zhang 
12918a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
12928a0c6efdSHong Zhang     PetscInt *diag=a->diag;
12938a0c6efdSHong Zhang     aa   = a->a;
12948a0c6efdSHong Zhang     ambs = a->mbs;
12958a0c6efdSHong Zhang     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
12968a0c6efdSHong Zhang     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
12978a0c6efdSHong Zhang     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
12988a0c6efdSHong Zhang     PetscFunctionReturn(0);
12998a0c6efdSHong Zhang   }
13008a0c6efdSHong Zhang 
130149b5e25fSSatish Balay   ai   = a->i;
130249b5e25fSSatish Balay   aj   = a->j;
130349b5e25fSSatish Balay   bs2  = a->bs2;
13042dcb1b2aSMatthew Knepley   ierr = VecSet(v,zero);CHKERRQ(ierr);
13051ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
130649b5e25fSSatish Balay   for (i=0; i<ambs; i++) {
130749b5e25fSSatish Balay     j=ai[i];
130849b5e25fSSatish Balay     if (aj[j] == i) {    /* if this is a diagonal element */
130949b5e25fSSatish Balay       row  = i*bs;
131049b5e25fSSatish Balay       aa_j = aa + j*bs2;
131149b5e25fSSatish Balay       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
131249b5e25fSSatish Balay     }
131349b5e25fSSatish Balay   }
13141ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
131549b5e25fSSatish Balay   PetscFunctionReturn(0);
131649b5e25fSSatish Balay }
131749b5e25fSSatish Balay 
13184a2ae208SSatish Balay #undef __FUNCT__
13194a2ae208SSatish Balay #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1320dfbe8321SBarry Smith PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
132149b5e25fSSatish Balay {
132249b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1323*d9ca1df4SBarry Smith   PetscScalar       x;
1324*d9ca1df4SBarry Smith   const PetscScalar *l,*li,*ri;
132549b5e25fSSatish Balay   MatScalar         *aa,*v;
1326dfbe8321SBarry Smith   PetscErrorCode    ierr;
13275e90f9d9SHong Zhang   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1328ace3abfcSBarry Smith   PetscBool         flg;
132949b5e25fSSatish Balay 
133049b5e25fSSatish Balay   PetscFunctionBegin;
1331b3bf805bSHong Zhang   if (ll != rr) {
1332b3bf805bSHong Zhang     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1333e7e72b3dSBarry Smith     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1334b3bf805bSHong Zhang   }
1335b3bf805bSHong Zhang   if (!ll) PetscFunctionReturn(0);
133649b5e25fSSatish Balay   ai  = a->i;
133749b5e25fSSatish Balay   aj  = a->j;
133849b5e25fSSatish Balay   aa  = a->a;
1339d0f46423SBarry Smith   m   = A->rmap->N;
1340d0f46423SBarry Smith   bs  = A->rmap->bs;
134149b5e25fSSatish Balay   mbs = a->mbs;
134249b5e25fSSatish Balay   bs2 = a->bs2;
134349b5e25fSSatish Balay 
1344*d9ca1df4SBarry Smith   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
134549b5e25fSSatish Balay   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1346e32f2f54SBarry Smith   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
134749b5e25fSSatish Balay   for (i=0; i<mbs; i++) { /* for each block row */
134849b5e25fSSatish Balay     M  = ai[i+1] - ai[i];
134949b5e25fSSatish Balay     li = l + i*bs;
135049b5e25fSSatish Balay     v  = aa + bs2*ai[i];
135149b5e25fSSatish Balay     for (j=0; j<M; j++) { /* for each block */
135249b5e25fSSatish Balay       ri = l + bs*aj[ai[i]+j];
13535e90f9d9SHong Zhang       for (k=0; k<bs; k++) {
135449b5e25fSSatish Balay         x = ri[k];
135549b5e25fSSatish Balay         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
135649b5e25fSSatish Balay       }
135749b5e25fSSatish Balay     }
135849b5e25fSSatish Balay   }
1359*d9ca1df4SBarry Smith   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1360dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
136149b5e25fSSatish Balay   PetscFunctionReturn(0);
136249b5e25fSSatish Balay }
136349b5e25fSSatish Balay 
13644a2ae208SSatish Balay #undef __FUNCT__
13654a2ae208SSatish Balay #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1366dfbe8321SBarry Smith PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
136749b5e25fSSatish Balay {
136849b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
136949b5e25fSSatish Balay 
137049b5e25fSSatish Balay   PetscFunctionBegin;
137149b5e25fSSatish Balay   info->block_size   = a->bs2;
1372ceed8ce5SJed Brown   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
13736c6c5352SBarry Smith   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
137449b5e25fSSatish Balay   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
137549b5e25fSSatish Balay   info->assemblies   = A->num_ass;
13768e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
13777adad957SLisandro Dalcin   info->memory       = ((PetscObject)A)->mem;
1378d5f3da31SBarry Smith   if (A->factortype) {
137949b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
138049b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
138149b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
138249b5e25fSSatish Balay   } else {
138349b5e25fSSatish Balay     info->fill_ratio_given  = 0;
138449b5e25fSSatish Balay     info->fill_ratio_needed = 0;
138549b5e25fSSatish Balay     info->factor_mallocs    = 0;
138649b5e25fSSatish Balay   }
138749b5e25fSSatish Balay   PetscFunctionReturn(0);
138849b5e25fSSatish Balay }
138949b5e25fSSatish Balay 
139049b5e25fSSatish Balay 
13914a2ae208SSatish Balay #undef __FUNCT__
13924a2ae208SSatish Balay #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1393dfbe8321SBarry Smith PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
139449b5e25fSSatish Balay {
139549b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1396dfbe8321SBarry Smith   PetscErrorCode ierr;
139749b5e25fSSatish Balay 
139849b5e25fSSatish Balay   PetscFunctionBegin;
139949b5e25fSSatish Balay   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
140049b5e25fSSatish Balay   PetscFunctionReturn(0);
140149b5e25fSSatish Balay }
1402dc354874SHong Zhang 
14034a2ae208SSatish Balay #undef __FUNCT__
1404985db425SBarry Smith #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1405985db425SBarry Smith /*
1406985db425SBarry Smith    This code does not work since it only checks the upper triangular part of
1407985db425SBarry Smith   the matrix. Hence it is not listed in the function table.
1408985db425SBarry Smith */
1409985db425SBarry Smith PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1410dc354874SHong Zhang {
1411dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1412dfbe8321SBarry Smith   PetscErrorCode  ierr;
1413*d9ca1df4SBarry Smith   PetscInt        i,j,n,row,col,bs,mbs;
1414*d9ca1df4SBarry Smith   const PetscInt  *ai,*aj;
1415c3fca9a7SHong Zhang   PetscReal       atmp;
1416*d9ca1df4SBarry Smith   const MatScalar *aa;
1417985db425SBarry Smith   PetscScalar     *x;
141813f74950SBarry Smith   PetscInt        ncols,brow,bcol,krow,kcol;
1419dc354874SHong Zhang 
1420dc354874SHong Zhang   PetscFunctionBegin;
1421e32f2f54SBarry Smith   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1422e32f2f54SBarry Smith   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1423d0f46423SBarry Smith   bs  = A->rmap->bs;
1424dc354874SHong Zhang   aa  = a->a;
1425dc354874SHong Zhang   ai  = a->i;
1426dc354874SHong Zhang   aj  = a->j;
142744117c81SHong Zhang   mbs = a->mbs;
1428dc354874SHong Zhang 
1429985db425SBarry Smith   ierr = VecSet(v,0.0);CHKERRQ(ierr);
14301ebc52fbSHong Zhang   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1431dc354874SHong Zhang   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1432e32f2f54SBarry Smith   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
143344117c81SHong Zhang   for (i=0; i<mbs; i++) {
1434d0f6400bSHong Zhang     ncols = ai[1] - ai[0]; ai++;
1435d0f6400bSHong Zhang     brow  = bs*i;
143644117c81SHong Zhang     for (j=0; j<ncols; j++) {
1437d0f6400bSHong Zhang       bcol = bs*(*aj);
143844117c81SHong Zhang       for (kcol=0; kcol<bs; kcol++) {
1439d0f6400bSHong Zhang         col = bcol + kcol;      /* col index */
144044117c81SHong Zhang         for (krow=0; krow<bs; krow++) {
1441d0f6400bSHong Zhang           atmp = PetscAbsScalar(*aa); aa++;
1442d0f6400bSHong Zhang           row  = brow + krow;   /* row index */
1443c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1444c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
144544117c81SHong Zhang         }
144644117c81SHong Zhang       }
1447d0f6400bSHong Zhang       aj++;
1448dc354874SHong Zhang     }
1449dc354874SHong Zhang   }
14501ebc52fbSHong Zhang   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1451dc354874SHong Zhang   PetscFunctionReturn(0);
1452dc354874SHong Zhang }
1453