xref: /petsc/src/mat/impls/baij/seq/baij.c (revision c005e166144a6ac88a244813670ca9f5d5dfe78d)
1 
2 #ifndef lint
3 static char vcid[] = "$Id: baij.c,v 1.60 1996/07/31 19:21:20 balay Exp balay $";
4 #endif
5 
6 /*
7     Defines the basic matrix operations for the BAIJ (compressed row)
8   matrix storage format.
9 */
10 #include "baij.h"
11 #include "src/vec/vecimpl.h"
12 #include "src/inline/spops.h"
13 #include "petsc.h"
14 
15 extern   int MatToSymmetricIJ_SeqAIJ(int,int*,int*,int,int,int**,int**);
16 
17 static int MatGetReordering_SeqBAIJ(Mat A,MatReordering type,IS *rperm,IS *cperm)
18 {
19   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
20   int         ierr, *ia, *ja,n = a->mbs,*idx,i,ishift,oshift;
21 
22   /*
23      this is tacky: In the future when we have written special factorization
24      and solve routines for the identity permutation we should use a
25      stride index set instead of the general one.
26   */
27   if (type  == ORDER_NATURAL) {
28     idx = (int *) PetscMalloc( n*sizeof(int) ); CHKPTRQ(idx);
29     for ( i=0; i<n; i++ ) idx[i] = i;
30     ierr = ISCreateSeq(MPI_COMM_SELF,n,idx,rperm); CHKERRQ(ierr);
31     ierr = ISCreateSeq(MPI_COMM_SELF,n,idx,cperm); CHKERRQ(ierr);
32     PetscFree(idx);
33     ISSetPermutation(*rperm);
34     ISSetPermutation(*cperm);
35     ISSetIdentity(*rperm);
36     ISSetIdentity(*cperm);
37     return 0;
38   }
39 
40   MatReorderingRegisterAll();
41   ishift = 0;
42   oshift = -MatReorderingIndexShift[(int)type];
43   if (MatReorderingRequiresSymmetric[(int)type]) {
44     ierr = MatToSymmetricIJ_SeqAIJ(n,a->i,a->j,ishift,oshift,&ia,&ja);CHKERRQ(ierr);
45     ierr = MatGetReordering_IJ(n,ia,ja,type,rperm,cperm); CHKERRQ(ierr);
46     PetscFree(ia); PetscFree(ja);
47   } else {
48     if (ishift == oshift) {
49       ierr = MatGetReordering_IJ(n,a->i,a->j,type,rperm,cperm);CHKERRQ(ierr);
50     }
51     else if (ishift == -1) {
52       /* temporarily subtract 1 from i and j indices */
53       int nz = a->i[n] - 1;
54       for ( i=0; i<nz; i++ ) a->j[i]--;
55       for ( i=0; i<n+1; i++ ) a->i[i]--;
56       ierr = MatGetReordering_IJ(n,a->i,a->j,type,rperm,cperm);CHKERRQ(ierr);
57       for ( i=0; i<nz; i++ ) a->j[i]++;
58       for ( i=0; i<n+1; i++ ) a->i[i]++;
59     } else {
60       /* temporarily add 1 to i and j indices */
61       int nz = a->i[n] - 1;
62       for ( i=0; i<nz; i++ ) a->j[i]++;
63       for ( i=0; i<n+1; i++ ) a->i[i]++;
64       ierr = MatGetReordering_IJ(n,a->i,a->j,type,rperm,cperm);CHKERRQ(ierr);
65       for ( i=0; i<nz; i++ ) a->j[i]--;
66       for ( i=0; i<n+1; i++ ) a->i[i]--;
67     }
68   }
69   return 0;
70 }
71 
72 /*
73      Adds diagonal pointers to sparse matrix structure.
74 */
75 
76 int MatMarkDiag_SeqBAIJ(Mat A)
77 {
78   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
79   int         i,j, *diag, m = a->mbs;
80 
81   diag = (int *) PetscMalloc( (m+1)*sizeof(int)); CHKPTRQ(diag);
82   PLogObjectMemory(A,(m+1)*sizeof(int));
83   for ( i=0; i<m; i++ ) {
84     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
85       if (a->j[j] == i) {
86         diag[i] = j;
87         break;
88       }
89     }
90   }
91   a->diag = diag;
92   return 0;
93 }
94 
95 #include "draw.h"
96 #include "pinclude/pviewer.h"
97 #include "sys.h"
98 
99 static int MatView_SeqBAIJ_Binary(Mat A,Viewer viewer)
100 {
101   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
102   int         i, fd, *col_lens, ierr, bs = a->bs,count,*jj,j,k,l,bs2=a->bs2;
103   Scalar      *aa;
104 
105   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
106   col_lens = (int *) PetscMalloc((4+a->m)*sizeof(int));CHKPTRQ(col_lens);
107   col_lens[0] = MAT_COOKIE;
108   col_lens[1] = a->m;
109   col_lens[2] = a->n;
110   col_lens[3] = a->nz*bs2;
111 
112   /* store lengths of each row and write (including header) to file */
113   count = 0;
114   for ( i=0; i<a->mbs; i++ ) {
115     for ( j=0; j<bs; j++ ) {
116       col_lens[4+count++] = bs*(a->i[i+1] - a->i[i]);
117     }
118   }
119   ierr = PetscBinaryWrite(fd,col_lens,4+a->m,BINARY_INT,1); CHKERRQ(ierr);
120   PetscFree(col_lens);
121 
122   /* store column indices (zero start index) */
123   jj = (int *) PetscMalloc( a->nz*bs2*sizeof(int) ); CHKPTRQ(jj);
124   count = 0;
125   for ( i=0; i<a->mbs; i++ ) {
126     for ( j=0; j<bs; j++ ) {
127       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
128         for ( l=0; l<bs; l++ ) {
129           jj[count++] = bs*a->j[k] + l;
130         }
131       }
132     }
133   }
134   ierr = PetscBinaryWrite(fd,jj,bs2*a->nz,BINARY_INT,0); CHKERRQ(ierr);
135   PetscFree(jj);
136 
137   /* store nonzero values */
138   aa = (Scalar *) PetscMalloc(a->nz*bs2*sizeof(Scalar)); CHKPTRQ(aa);
139   count = 0;
140   for ( i=0; i<a->mbs; i++ ) {
141     for ( j=0; j<bs; j++ ) {
142       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
143         for ( l=0; l<bs; l++ ) {
144           aa[count++] = a->a[bs2*k + l*bs + j];
145         }
146       }
147     }
148   }
149   ierr = PetscBinaryWrite(fd,aa,bs2*a->nz,BINARY_SCALAR,0); CHKERRQ(ierr);
150   PetscFree(aa);
151   return 0;
152 }
153 
154 static int MatView_SeqBAIJ_ASCII(Mat A,Viewer viewer)
155 {
156   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
157   int         ierr, i,j,format,bs = a->bs,k,l,bs2=a->bs2;
158   FILE        *fd;
159   char        *outputname;
160 
161   ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
162   ierr = ViewerFileGetOutputname_Private(viewer,&outputname);CHKERRQ(ierr);
163   ierr = ViewerGetFormat(viewer,&format);
164   if (format == ASCII_FORMAT_INFO || format == ASCII_FORMAT_INFO_DETAILED) {
165     fprintf(fd,"  block size is %d\n",bs);
166   }
167   else if (format == ASCII_FORMAT_MATLAB) {
168     SETERRQ(1,"MatView_SeqBAIJ_ASCII:Matlab format not supported");
169   }
170   else if (format == ASCII_FORMAT_COMMON) {
171     for ( i=0; i<a->mbs; i++ ) {
172       for ( j=0; j<bs; j++ ) {
173         fprintf(fd,"row %d:",i*bs+j);
174         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
175           for ( l=0; l<bs; l++ ) {
176 #if defined(PETSC_COMPLEX)
177           if (imag(a->a[bs2*k + l*bs + j]) != 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
178             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
179               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
180           else if (real(a->a[bs2*k + l*bs + j]) != 0.0)
181             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
182 #else
183           if (a->a[bs2*k + l*bs + j] != 0.0)
184             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
185 #endif
186           }
187         }
188         fprintf(fd,"\n");
189       }
190     }
191   }
192   else {
193     for ( i=0; i<a->mbs; i++ ) {
194       for ( j=0; j<bs; j++ ) {
195         fprintf(fd,"row %d:",i*bs+j);
196         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
197           for ( l=0; l<bs; l++ ) {
198 #if defined(PETSC_COMPLEX)
199           if (imag(a->a[bs2*k + l*bs + j]) != 0.0) {
200             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
201               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
202           }
203           else {
204             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
205           }
206 #else
207             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
208 #endif
209           }
210         }
211         fprintf(fd,"\n");
212       }
213     }
214   }
215   fflush(fd);
216   return 0;
217 }
218 
219 static int MatView_SeqBAIJ(PetscObject obj,Viewer viewer)
220 {
221   Mat         A = (Mat) obj;
222   ViewerType  vtype;
223   int         ierr;
224 
225   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
226   if (vtype == MATLAB_VIEWER) {
227     SETERRQ(1,"MatView_SeqBAIJ:Matlab viewer not supported");
228   }
229   else if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER){
230     return MatView_SeqBAIJ_ASCII(A,viewer);
231   }
232   else if (vtype == BINARY_FILE_VIEWER) {
233     return MatView_SeqBAIJ_Binary(A,viewer);
234   }
235   else if (vtype == DRAW_VIEWER) {
236     SETERRQ(1,"MatView_SeqBAIJ:Draw viewer not supported");
237   }
238   return 0;
239 }
240 
241 #define CHUNKSIZE  10
242 
243 /* This version has row oriented v  */
244 static int MatSetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
245 {
246   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
247   int         *rp,k,low,high,t,ii,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
248   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen,roworiented=a->roworiented;
249   int         *aj=a->j,nonew=a->nonew,bs=a->bs,brow,bcol;
250   int          ridx,cidx,bs2=a->bs2;
251   Scalar      *ap,value,*aa=a->a,*bap;
252 
253   for ( k=0; k<m; k++ ) { /* loop over added rows */
254     row  = im[k]; brow = row/bs;
255     if (row < 0) SETERRQ(1,"MatSetValues_SeqBAIJ:Negative row");
256     if (row >= a->m) SETERRQ(1,"MatSetValues_SeqBAIJ:Row too large");
257     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
258     rmax = imax[brow]; nrow = ailen[brow];
259     low = 0;
260     for ( l=0; l<n; l++ ) { /* loop over added columns */
261       if (in[l] < 0) SETERRQ(1,"MatSetValues_SeqBAIJ:Negative column");
262       if (in[l] >= a->n) SETERRQ(1,"MatSetValues_SeqBAIJ:Column too large");
263       col = in[l]; bcol = col/bs;
264       ridx = row % bs; cidx = col % bs;
265       if (roworiented) {
266         value = *v++;
267       }
268       else {
269         value = v[k + l*m];
270       }
271       if (!sorted) low = 0; high = nrow;
272       while (high-low > 5) {
273         t = (low+high)/2;
274         if (rp[t] > bcol) high = t;
275         else             low  = t;
276       }
277       for ( i=low; i<high; i++ ) {
278         if (rp[i] > bcol) break;
279         if (rp[i] == bcol) {
280           bap  = ap +  bs2*i + bs*cidx + ridx;
281           if (is == ADD_VALUES) *bap += value;
282           else                  *bap  = value;
283           goto noinsert;
284         }
285       }
286       if (nonew) goto noinsert;
287       if (nrow >= rmax) {
288         /* there is no extra room in row, therefore enlarge */
289         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
290         Scalar *new_a;
291 
292         /* malloc new storage space */
293         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
294         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
295         new_j   = (int *) (new_a + bs2*new_nz);
296         new_i   = new_j + new_nz;
297 
298         /* copy over old data into new slots */
299         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];}
300         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
301         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));
302         len = (new_nz - CHUNKSIZE - ai[brow] - nrow);
303         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,
304                                                            len*sizeof(int));
305         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar));
306         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
307         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE),
308                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));
309         /* free up old matrix storage */
310         PetscFree(a->a);
311         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
312         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
313         a->singlemalloc = 1;
314 
315         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
316         rmax = imax[brow] = imax[brow] + CHUNKSIZE;
317         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
318         a->maxnz += bs2*CHUNKSIZE;
319         a->reallocs++;
320         a->nz++;
321       }
322       N = nrow++ - 1;
323       /* shift up all the later entries in this row */
324       for ( ii=N; ii>=i; ii-- ) {
325         rp[ii+1] = rp[ii];
326          PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
327       }
328       if (N>=i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
329       rp[i] = bcol;
330       ap[bs2*i + bs*cidx + ridx] = value;
331       noinsert:;
332       low = i;
333     }
334     ailen[brow] = nrow;
335   }
336   return 0;
337 }
338 
339 static int MatGetSize_SeqBAIJ(Mat A,int *m,int *n)
340 {
341   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
342   *m = a->m; *n = a->n;
343   return 0;
344 }
345 
346 static int MatGetOwnershipRange_SeqBAIJ(Mat A,int *m,int *n)
347 {
348   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
349   *m = 0; *n = a->m;
350   return 0;
351 }
352 
353 int MatGetRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
354 {
355   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
356   int          itmp,i,j,k,M,*ai,*aj,bs,bn,bp,*idx_i,bs2;
357   Scalar      *aa,*v_i,*aa_i;
358 
359   bs  = a->bs;
360   ai  = a->i;
361   aj  = a->j;
362   aa  = a->a;
363   bs2 = a->bs2;
364 
365   if (row < 0 || row >= a->m) SETERRQ(1,"MatGetRow_SeqBAIJ:Row out of range");
366 
367   bn  = row/bs;   /* Block number */
368   bp  = row % bs; /* Block Position */
369   M   = ai[bn+1] - ai[bn];
370   *nz = bs*M;
371 
372   if (v) {
373     *v = 0;
374     if (*nz) {
375       *v = (Scalar *) PetscMalloc( (*nz)*sizeof(Scalar) ); CHKPTRQ(*v);
376       for ( i=0; i<M; i++ ) { /* for each block in the block row */
377         v_i  = *v + i*bs;
378         aa_i = aa + bs2*(ai[bn] + i);
379         for ( j=bp,k=0; j<bs2; j+=bs,k++ ) {v_i[k] = aa_i[j];}
380       }
381     }
382   }
383 
384   if (idx) {
385     *idx = 0;
386     if (*nz) {
387       *idx = (int *) PetscMalloc( (*nz)*sizeof(int) ); CHKPTRQ(*idx);
388       for ( i=0; i<M; i++ ) { /* for each block in the block row */
389         idx_i = *idx + i*bs;
390         itmp  = bs*aj[ai[bn] + i];
391         for ( j=0; j<bs; j++ ) {idx_i[j] = itmp++;}
392       }
393     }
394   }
395   return 0;
396 }
397 
398 int MatRestoreRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
399 {
400   if (idx) {if (*idx) PetscFree(*idx);}
401   if (v)   {if (*v)   PetscFree(*v);}
402   return 0;
403 }
404 
405 static int MatTranspose_SeqBAIJ(Mat A,Mat *B)
406 {
407   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
408   Mat         C;
409   int         i,j,k,ierr,*aj=a->j,*ai=a->i,bs=a->bs,mbs=a->mbs,nbs=a->nbs,len,*col;
410   int         *rows,*cols,bs2=a->bs2;
411   Scalar      *array=a->a;
412 
413   if (B==PETSC_NULL && mbs!=nbs)
414     SETERRQ(1,"MatTranspose_SeqBAIJ:Square matrix only for in-place");
415   col = (int *) PetscMalloc((1+nbs)*sizeof(int)); CHKPTRQ(col);
416   PetscMemzero(col,(1+nbs)*sizeof(int));
417 
418   for ( i=0; i<ai[mbs]; i++ ) col[aj[i]] += 1;
419   ierr = MatCreateSeqBAIJ(A->comm,bs,a->n,a->m,PETSC_NULL,col,&C); CHKERRQ(ierr);
420   PetscFree(col);
421   rows = (int *) PetscMalloc(2*bs*sizeof(int)); CHKPTRQ(rows);
422   cols = rows + bs;
423   for ( i=0; i<mbs; i++ ) {
424     cols[0] = i*bs;
425     for (k=1; k<bs; k++ ) cols[k] = cols[k-1] + 1;
426     len = ai[i+1] - ai[i];
427     for ( j=0; j<len; j++ ) {
428       rows[0] = (*aj++)*bs;
429       for (k=1; k<bs; k++ ) rows[k] = rows[k-1] + 1;
430       ierr = MatSetValues(C,bs,rows,bs,cols,array,INSERT_VALUES); CHKERRQ(ierr);
431       array += bs2;
432     }
433   }
434   PetscFree(rows);
435 
436   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
437   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
438 
439   if (B != PETSC_NULL) {
440     *B = C;
441   } else {
442     /* This isn't really an in-place transpose */
443     PetscFree(a->a);
444     if (!a->singlemalloc) {PetscFree(a->i); PetscFree(a->j);}
445     if (a->diag) PetscFree(a->diag);
446     if (a->ilen) PetscFree(a->ilen);
447     if (a->imax) PetscFree(a->imax);
448     if (a->solve_work) PetscFree(a->solve_work);
449     PetscFree(a);
450     PetscMemcpy(A,C,sizeof(struct _Mat));
451     PetscHeaderDestroy(C);
452   }
453   return 0;
454 }
455 
456 
457 static int MatAssemblyEnd_SeqBAIJ(Mat A,MatAssemblyType mode)
458 {
459   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
460   int        fshift = 0,i,j,*ai = a->i, *aj = a->j, *imax = a->imax;
461   int        m = a->m,*ip, N, *ailen = a->ilen;
462   int        mbs = a->mbs, bs2 = a->bs2;
463   Scalar     *aa = a->a, *ap;
464 
465   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
466 
467   for ( i=1; i<mbs; i++ ) {
468     /* move each row back by the amount of empty slots (fshift) before it*/
469     fshift += imax[i-1] - ailen[i-1];
470     if (fshift) {
471       ip = aj + ai[i]; ap = aa + bs2*ai[i];
472       N = ailen[i];
473       for ( j=0; j<N; j++ ) {
474         ip[j-fshift] = ip[j];
475         PetscMemcpy(ap+(j-fshift)*bs2,ap+j*bs2,bs2*sizeof(Scalar));
476       }
477     }
478     ai[i] = ai[i-1] + ailen[i-1];
479   }
480   if (mbs) {
481     fshift += imax[mbs-1] - ailen[mbs-1];
482     ai[mbs] = ai[mbs-1] + ailen[mbs-1];
483   }
484   /* reset ilen and imax for each row */
485   for ( i=0; i<mbs; i++ ) {
486     ailen[i] = imax[i] = ai[i+1] - ai[i];
487   }
488   a->nz = ai[mbs];
489 
490   /* diagonals may have moved, so kill the diagonal pointers */
491   if (fshift && a->diag) {
492     PetscFree(a->diag);
493     PLogObjectMemory(A,-(m+1)*sizeof(int));
494     a->diag = 0;
495   }
496   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ: Unneed storage space (blocks) %d used %d, rows %d, block size %d\n", fshift*bs2,a->nz*bs2,m,a->bs);
497   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ: Number of mallocs during MatSetValues %d\n",
498            a->reallocs);
499   return 0;
500 }
501 
502 static int MatZeroEntries_SeqBAIJ(Mat A)
503 {
504   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
505   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
506   return 0;
507 }
508 
509 int MatDestroy_SeqBAIJ(PetscObject obj)
510 {
511   Mat         A  = (Mat) obj;
512   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
513 
514 #if defined(PETSC_LOG)
515   PLogObjectState(obj,"Rows=%d, Cols=%d, NZ=%d",a->m,a->n,a->nz);
516 #endif
517   PetscFree(a->a);
518   if (!a->singlemalloc) { PetscFree(a->i); PetscFree(a->j);}
519   if (a->diag) PetscFree(a->diag);
520   if (a->ilen) PetscFree(a->ilen);
521   if (a->imax) PetscFree(a->imax);
522   if (a->solve_work) PetscFree(a->solve_work);
523   if (a->mult_work) PetscFree(a->mult_work);
524   PetscFree(a);
525   PLogObjectDestroy(A);
526   PetscHeaderDestroy(A);
527   return 0;
528 }
529 
530 static int MatSetOption_SeqBAIJ(Mat A,MatOption op)
531 {
532   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
533   if      (op == MAT_ROW_ORIENTED)              a->roworiented = 1;
534   else if (op == MAT_COLUMN_ORIENTED)           a->roworiented = 0;
535   else if (op == MAT_COLUMNS_SORTED)            a->sorted      = 1;
536   else if (op == MAT_NO_NEW_NONZERO_LOCATIONS)  a->nonew       = 1;
537   else if (op == MAT_YES_NEW_NONZERO_LOCATIONS) a->nonew       = 0;
538   else if (op == MAT_ROWS_SORTED ||
539            op == MAT_SYMMETRIC ||
540            op == MAT_STRUCTURALLY_SYMMETRIC ||
541            op == MAT_YES_NEW_DIAGONALS)
542     PLogInfo(A,"Info:MatSetOption_SeqBAIJ:Option ignored\n");
543   else if (op == MAT_NO_NEW_DIAGONALS)
544     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_SeqBAIJ:MAT_NO_NEW_DIAGONALS");}
545   else
546     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_SeqBAIJ:unknown option");}
547   return 0;
548 }
549 
550 
551 /* -------------------------------------------------------*/
552 /* Should check that shapes of vectors and matrices match */
553 /* -------------------------------------------------------*/
554 #include "pinclude/plapack.h"
555 
556 static int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
557 {
558   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
559   Scalar          *xg,*zg;
560   register Scalar *x,*z,*v,sum;
561   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
562 
563   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
564   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
565 
566   idx   = a->j;
567   v     = a->a;
568   ii    = a->i;
569 
570   for ( i=0; i<mbs; i++ ) {
571     n    = ii[1] - ii[0]; ii++;
572     sum  = 0.0;
573     while (n--) sum += *v++ * x[*idx++];
574     z[i] = sum;
575   }
576   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
577   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
578   PLogFlops(2*a->nz - a->m);
579   return 0;
580 }
581 
582 static int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
583 {
584   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
585   Scalar          *xg,*zg;
586   register Scalar *x,*z,*v,*xb,sum1,sum2;
587   register Scalar x1,x2;
588   int             mbs=a->mbs,i,*idx,*ii;
589   int             j,n,ierr;
590 
591   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
592   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
593 
594   idx   = a->j;
595   v     = a->a;
596   ii    = a->i;
597 
598   for ( i=0; i<mbs; i++ ) {
599     n  = ii[1] - ii[0]; ii++;
600     sum1 = 0.0; sum2 = 0.0;
601     for ( j=0; j<n; j++ ) {
602       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
603       sum1 += v[0]*x1 + v[2]*x2;
604       sum2 += v[1]*x1 + v[3]*x2;
605       v += 4;
606     }
607     z[0] = sum1; z[1] = sum2;
608     z += 2;
609   }
610   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
611   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
612   PLogFlops(4*a->nz - a->m);
613   return 0;
614 }
615 
616 static int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
617 {
618   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
619   Scalar          *xg,*zg;
620   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
621   int             mbs=a->mbs,i,*idx,*ii,j,n,ierr;
622 
623   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
624   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
625 
626   idx   = a->j;
627   v     = a->a;
628   ii    = a->i;
629 
630   for ( i=0; i<mbs; i++ ) {
631     n  = ii[1] - ii[0]; ii++;
632     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
633     for ( j=0; j<n; j++ ) {
634       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
635       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
636       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
637       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
638       v += 9;
639     }
640     z[0] = sum1; z[1] = sum2; z[2] = sum3;
641     z += 3;
642   }
643   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
644   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
645   PLogFlops(18*a->nz - a->m);
646   return 0;
647 }
648 
649 static int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
650 {
651   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
652   Scalar          *xg,*zg;
653   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4;
654   register Scalar x1,x2,x3,x4;
655   int             mbs=a->mbs,i,*idx,*ii;
656   int             j,n,ierr;
657 
658   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
659   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
660 
661   idx   = a->j;
662   v     = a->a;
663   ii    = a->i;
664 
665   for ( i=0; i<mbs; i++ ) {
666     n  = ii[1] - ii[0]; ii++;
667     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
668     for ( j=0; j<n; j++ ) {
669       xb = x + 4*(*idx++);
670       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
671       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
672       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
673       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
674       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
675       v += 16;
676     }
677     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
678     z += 4;
679   }
680   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
681   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
682   PLogFlops(32*a->nz - a->m);
683   return 0;
684 }
685 
686 static int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
687 {
688   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
689   Scalar          *xg,*zg;
690   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
691   register Scalar x1,x2,x3,x4,x5;
692   int             mbs=a->mbs,i,*idx,*ii,j,n,ierr;
693 
694   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
695   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
696 
697   idx   = a->j;
698   v     = a->a;
699   ii    = a->i;
700 
701   for ( i=0; i<mbs; i++ ) {
702     n  = ii[1] - ii[0]; ii++;
703     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
704     for ( j=0; j<n; j++ ) {
705       xb = x + 5*(*idx++);
706       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
707       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
708       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
709       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
710       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
711       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
712       v += 25;
713     }
714     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
715     z += 5;
716   }
717   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
718   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
719   PLogFlops(50*a->nz - a->m);
720   return 0;
721 }
722 
723 static int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
724 {
725   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
726   Scalar          *xg,*zg;
727   register Scalar *x,*z,*v,*xb;
728   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
729   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt, _DZero = 0.0;
730 
731   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
732   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
733 
734   idx   = a->j;
735   v     = a->a;
736   ii    = a->i;
737 
738 
739   if (!a->mult_work) {
740     k = PetscMax(a->m,a->n);
741     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
742   }
743   work = a->mult_work;
744   for ( i=0; i<mbs; i++ ) {
745     n     = ii[1] - ii[0]; ii++;
746     ncols = n*bs;
747     workt = work;
748     for ( j=0; j<n; j++ ) {
749       xb = x + bs*(*idx++);
750       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
751       workt += bs;
752     }
753     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
754     v += n*bs2;
755     z += bs;
756   }
757    ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
758   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
759   PLogFlops(2*a->nz*bs2 - a->m);
760   return 0;
761 }
762 
763 static int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
764 {
765   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
766   Scalar          *xg,*yg,*zg;
767   register Scalar *x,*y,*z,*v,sum;
768   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
769 
770   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
771   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
772   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
773 
774   idx   = a->j;
775   v     = a->a;
776   ii    = a->i;
777 
778   for ( i=0; i<mbs; i++ ) {
779     n    = ii[1] - ii[0]; ii++;
780     sum  = y[i];
781     while (n--) sum += *v++ * x[*idx++];
782     z[i] = sum;
783   }
784   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
785   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
786   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
787   PLogFlops(2*a->nz);
788   return 0;
789 }
790 
791 static int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
792 {
793   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
794   Scalar          *xg,*yg,*zg;
795   register Scalar *x,*y,*z,*v,*xb,sum1,sum2;
796   register Scalar x1,x2;
797   int             mbs=a->mbs,i,*idx,*ii;
798   int             j,n,ierr;
799 
800   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
801   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
802   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
803 
804   idx   = a->j;
805   v     = a->a;
806   ii    = a->i;
807 
808   for ( i=0; i<mbs; i++ ) {
809     n  = ii[1] - ii[0]; ii++;
810     sum1 = y[0]; sum2 = y[1];
811     for ( j=0; j<n; j++ ) {
812       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
813       sum1 += v[0]*x1 + v[2]*x2;
814       sum2 += v[1]*x1 + v[3]*x2;
815       v += 4;
816     }
817     z[0] = sum1; z[1] = sum2;
818     z += 2; y += 2;
819   }
820   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
821   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
822   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
823   PLogFlops(4*a->nz);
824   return 0;
825 }
826 
827 static int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
828 {
829   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
830   Scalar          *xg,*yg,*zg;
831   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
832   int             mbs=a->mbs,i,*idx,*ii,j,n,ierr;
833 
834   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
835   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
836   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
837 
838   idx   = a->j;
839   v     = a->a;
840   ii    = a->i;
841 
842   for ( i=0; i<mbs; i++ ) {
843     n  = ii[1] - ii[0]; ii++;
844     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
845     for ( j=0; j<n; j++ ) {
846       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
847       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
848       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
849       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
850       v += 9;
851     }
852     z[0] = sum1; z[1] = sum2; z[2] = sum3;
853     z += 3; y += 3;
854   }
855   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
856   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
857   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
858   PLogFlops(18*a->nz);
859   return 0;
860 }
861 
862 static int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
863 {
864   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
865   Scalar          *xg,*yg,*zg;
866   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4;
867   register Scalar x1,x2,x3,x4;
868   int             mbs=a->mbs,i,*idx,*ii;
869   int             j,n,ierr;
870 
871   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
872   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
873   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
874 
875   idx   = a->j;
876   v     = a->a;
877   ii    = a->i;
878 
879   for ( i=0; i<mbs; i++ ) {
880     n  = ii[1] - ii[0]; ii++;
881     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
882     for ( j=0; j<n; j++ ) {
883       xb = x + 4*(*idx++);
884       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
885       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
886       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
887       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
888       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
889       v += 16;
890     }
891     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
892     z += 4; y += 4;
893   }
894   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
895   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
896   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
897   PLogFlops(32*a->nz);
898   return 0;
899 }
900 
901 static int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
902 {
903   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
904   Scalar          *xg,*yg,*zg;
905   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
906   register Scalar x1,x2,x3,x4,x5;
907   int             mbs=a->mbs,i,*idx,*ii,j,n,ierr;
908 
909   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
910   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
911   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
912 
913   idx   = a->j;
914   v     = a->a;
915   ii    = a->i;
916 
917   for ( i=0; i<mbs; i++ ) {
918     n  = ii[1] - ii[0]; ii++;
919     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
920     for ( j=0; j<n; j++ ) {
921       xb = x + 5*(*idx++);
922       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
923       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
924       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
925       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
926       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
927       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
928       v += 25;
929     }
930     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
931     z += 5; y += 5;
932   }
933   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
934   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
935   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
936   PLogFlops(50*a->nz);
937   return 0;
938 }
939 
940 static int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
941 {
942   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
943   Scalar          *xg,*zg;
944   register Scalar *x,*z,*v,*xb;
945   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
946   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
947 
948   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
949 
950   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
951   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
952 
953   idx   = a->j;
954   v     = a->a;
955   ii    = a->i;
956 
957 
958   if (!a->mult_work) {
959     k = PetscMax(a->m,a->n);
960     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
961   }
962   work = a->mult_work;
963   for ( i=0; i<mbs; i++ ) {
964     n     = ii[1] - ii[0]; ii++;
965     ncols = n*bs;
966     workt = work;
967     for ( j=0; j<n; j++ ) {
968       xb = x + bs*(*idx++);
969       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
970       workt += bs;
971     }
972     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
973     v += n*bs2;
974     z += bs;
975   }
976   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
977   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
978   PLogFlops(2*a->nz*bs2 );
979   return 0;
980 }
981 
982 static int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
983 {
984   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
985   Scalar          *xg,*zg,*zb;
986   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
987   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
988   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
989 
990 
991   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
992   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
993   PetscMemzero(z,N*sizeof(Scalar));
994 
995   idx   = a->j;
996   v     = a->a;
997   ii    = a->i;
998 
999   switch (bs) {
1000   case 1:
1001     for ( i=0; i<mbs; i++ ) {
1002       n  = ii[1] - ii[0]; ii++;
1003       xb = x + i; x1 = xb[0];
1004       ib = idx + ai[i];
1005       for ( j=0; j<n; j++ ) {
1006         rval    = ib[j];
1007         z[rval] += *v++ * x1;
1008       }
1009     }
1010     break;
1011   case 2:
1012     for ( i=0; i<mbs; i++ ) {
1013       n  = ii[1] - ii[0]; ii++;
1014       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1015       ib = idx + ai[i];
1016       for ( j=0; j<n; j++ ) {
1017         rval      = ib[j]*2;
1018         z[rval++] += v[0]*x1 + v[1]*x2;
1019         z[rval++] += v[2]*x1 + v[3]*x2;
1020         v += 4;
1021       }
1022     }
1023     break;
1024   case 3:
1025     for ( i=0; i<mbs; i++ ) {
1026       n  = ii[1] - ii[0]; ii++;
1027       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1028       ib = idx + ai[i];
1029       for ( j=0; j<n; j++ ) {
1030         rval      = ib[j]*3;
1031         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1032         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1033         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1034         v += 9;
1035       }
1036     }
1037     break;
1038   case 4:
1039     for ( i=0; i<mbs; i++ ) {
1040       n  = ii[1] - ii[0]; ii++;
1041       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1042       ib = idx + ai[i];
1043       for ( j=0; j<n; j++ ) {
1044         rval      = ib[j]*4;
1045         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1046         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1047         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1048         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1049         v += 16;
1050       }
1051     }
1052     break;
1053   case 5:
1054     for ( i=0; i<mbs; i++ ) {
1055       n  = ii[1] - ii[0]; ii++;
1056       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1057       x4 = xb[3];   x5 = xb[4];
1058       ib = idx + ai[i];
1059       for ( j=0; j<n; j++ ) {
1060         rval      = ib[j]*5;
1061         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1062         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1063         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1064         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1065         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1066         v += 25;
1067       }
1068     }
1069     break;
1070       /* block sizes larger then 5 by 5 are handled by BLAS */
1071     default: {
1072       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1073       if (!a->mult_work) {
1074         k = PetscMax(a->m,a->n);
1075         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1076         CHKPTRQ(a->mult_work);
1077       }
1078       work = a->mult_work;
1079       for ( i=0; i<mbs; i++ ) {
1080         n     = ii[1] - ii[0]; ii++;
1081         ncols = n*bs;
1082         PetscMemzero(work,ncols*sizeof(Scalar));
1083         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1084         v += n*bs2;
1085         x += bs;
1086         workt = work;
1087         for ( j=0; j<n; j++ ) {
1088           zb = z + bs*(*idx++);
1089           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1090           workt += bs;
1091         }
1092       }
1093     }
1094   }
1095   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1096   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1097   PLogFlops(2*a->nz*a->bs2 - a->n);
1098   return 0;
1099 }
1100 static int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1101 {
1102   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1103   Scalar          *xg,*zg,*zb;
1104   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1105   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1106   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1107 
1108 
1109 
1110   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
1111   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
1112 
1113   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1114   else PetscMemzero(z,N*sizeof(Scalar));
1115 
1116 
1117   idx   = a->j;
1118   v     = a->a;
1119   ii    = a->i;
1120 
1121   switch (bs) {
1122   case 1:
1123     for ( i=0; i<mbs; i++ ) {
1124       n  = ii[1] - ii[0]; ii++;
1125       xb = x + i; x1 = xb[0];
1126       ib = idx + ai[i];
1127       for ( j=0; j<n; j++ ) {
1128         rval    = ib[j];
1129         z[rval] += *v++ * x1;
1130       }
1131     }
1132     break;
1133   case 2:
1134     for ( i=0; i<mbs; i++ ) {
1135       n  = ii[1] - ii[0]; ii++;
1136       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1137       ib = idx + ai[i];
1138       for ( j=0; j<n; j++ ) {
1139         rval      = ib[j]*2;
1140         z[rval++] += v[0]*x1 + v[1]*x2;
1141         z[rval++] += v[2]*x1 + v[3]*x2;
1142         v += 4;
1143       }
1144     }
1145     break;
1146   case 3:
1147     for ( i=0; i<mbs; i++ ) {
1148       n  = ii[1] - ii[0]; ii++;
1149       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1150       ib = idx + ai[i];
1151       for ( j=0; j<n; j++ ) {
1152         rval      = ib[j]*3;
1153         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1154         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1155         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1156         v += 9;
1157       }
1158     }
1159     break;
1160   case 4:
1161     for ( i=0; i<mbs; i++ ) {
1162       n  = ii[1] - ii[0]; ii++;
1163       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1164       ib = idx + ai[i];
1165       for ( j=0; j<n; j++ ) {
1166         rval      = ib[j]*4;
1167         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1168         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1169         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1170         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1171         v += 16;
1172       }
1173     }
1174     break;
1175   case 5:
1176     for ( i=0; i<mbs; i++ ) {
1177       n  = ii[1] - ii[0]; ii++;
1178       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1179       x4 = xb[3];   x5 = xb[4];
1180       ib = idx + ai[i];
1181       for ( j=0; j<n; j++ ) {
1182         rval      = ib[j]*5;
1183         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1184         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1185         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1186         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1187         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1188         v += 25;
1189       }
1190     }
1191     break;
1192       /* block sizes larger then 5 by 5 are handled by BLAS */
1193     default: {
1194       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1195       if (!a->mult_work) {
1196         k = PetscMax(a->m,a->n);
1197         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1198         CHKPTRQ(a->mult_work);
1199       }
1200       work = a->mult_work;
1201       for ( i=0; i<mbs; i++ ) {
1202         n     = ii[1] - ii[0]; ii++;
1203         ncols = n*bs;
1204         PetscMemzero(work,ncols*sizeof(Scalar));
1205         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1206         v += n*bs2;
1207         x += bs;
1208         workt = work;
1209         for ( j=0; j<n; j++ ) {
1210           zb = z + bs*(*idx++);
1211           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1212           workt += bs;
1213         }
1214       }
1215     }
1216   }
1217   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1218   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1219   PLogFlops(2*a->nz*a->bs2);
1220   return 0;
1221 }
1222 
1223 static int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,int *nz,int *nza,int *mem)
1224 {
1225   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1226   if (nz)  *nz  = a->bs2*a->nz;
1227   if (nza) *nza = a->maxnz;
1228   if (mem) *mem = (int)A->mem;
1229   return 0;
1230 }
1231 
1232 static int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1233 {
1234   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1235 
1236   if (B->type !=MATSEQBAIJ)SETERRQ(1,"MatEqual_SeqBAIJ:Matrices must be same type");
1237 
1238   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1239   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)||
1240       (a->nz != b->nz)) {
1241     *flg = PETSC_FALSE; return 0;
1242   }
1243 
1244   /* if the a->i are the same */
1245   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1246     *flg = PETSC_FALSE; return 0;
1247   }
1248 
1249   /* if a->j are the same */
1250   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1251     *flg = PETSC_FALSE; return 0;
1252   }
1253 
1254   /* if a->a are the same */
1255   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1256     *flg = PETSC_FALSE; return 0;
1257   }
1258   *flg = PETSC_TRUE;
1259   return 0;
1260 
1261 }
1262 
1263 static int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1264 {
1265   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1266   int         i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1267   Scalar      *x, zero = 0.0,*aa,*aa_j;
1268 
1269   bs   = a->bs;
1270   aa   = a->a;
1271   ai   = a->i;
1272   aj   = a->j;
1273   ambs = a->mbs;
1274   bs2  = a->bs2;
1275 
1276   VecSet(&zero,v);
1277   VecGetArray(v,&x); VecGetLocalSize(v,&n);
1278   if (n != a->m) SETERRQ(1,"MatGetDiagonal_SeqBAIJ:Nonconforming matrix and vector");
1279   for ( i=0; i<ambs; i++ ) {
1280     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1281       if (aj[j] == i) {
1282         row  = i*bs;
1283         aa_j = aa+j*bs2;
1284         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1285         break;
1286       }
1287     }
1288   }
1289   return 0;
1290 }
1291 
1292 static int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1293 {
1294   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1295   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1296   int         i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1297 
1298   ai  = a->i;
1299   aj  = a->j;
1300   aa  = a->a;
1301   m   = a->m;
1302   n   = a->n;
1303   bs  = a->bs;
1304   mbs = a->mbs;
1305   bs2 = a->bs2;
1306   if (ll) {
1307     VecGetArray(ll,&l); VecGetSize(ll,&lm);
1308     if (lm != m) SETERRQ(1,"MatDiagonalScale_SeqBAIJ:Left scaling vector wrong length");
1309     for ( i=0; i<mbs; i++ ) { /* for each block row */
1310       M  = ai[i+1] - ai[i];
1311       li = l + i*bs;
1312       v  = aa + bs2*ai[i];
1313       for ( j=0; j<M; j++ ) { /* for each block */
1314         for ( k=0; k<bs2; k++ ) {
1315           (*v++) *= li[k%bs];
1316         }
1317       }
1318     }
1319   }
1320 
1321   if (rr) {
1322     VecGetArray(rr,&r); VecGetSize(rr,&rn);
1323     if (rn != n) SETERRQ(1,"MatDiagonalScale_SeqBAIJ:Right scaling vector wrong length");
1324     for ( i=0; i<mbs; i++ ) { /* for each block row */
1325       M  = ai[i+1] - ai[i];
1326       v  = aa + bs2*ai[i];
1327       for ( j=0; j<M; j++ ) { /* for each block */
1328         ri = r + bs*aj[ai[i]+j];
1329         for ( k=0; k<bs; k++ ) {
1330           x = ri[k];
1331           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1332         }
1333       }
1334     }
1335   }
1336   return 0;
1337 }
1338 
1339 
1340 extern int MatLUFactorSymbolic_SeqBAIJ(Mat,IS,IS,double,Mat*);
1341 extern int MatLUFactor_SeqBAIJ(Mat,IS,IS,double);
1342 extern int MatIncreaseOverlap_SeqBAIJ(Mat,int,IS*,int);
1343 extern int MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatGetSubMatrixCall,Mat*);
1344 extern int MatGetSubMatrices_SeqBAIJ(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat**);
1345 
1346 extern int MatSolve_SeqBAIJ_N(Mat,Vec,Vec);
1347 extern int MatSolve_SeqBAIJ_1(Mat,Vec,Vec);
1348 extern int MatSolve_SeqBAIJ_2(Mat,Vec,Vec);
1349 extern int MatSolve_SeqBAIJ_3(Mat,Vec,Vec);
1350 extern int MatSolve_SeqBAIJ_4(Mat,Vec,Vec);
1351 extern int MatSolve_SeqBAIJ_5(Mat,Vec,Vec);
1352 
1353 extern int MatLUFactorNumeric_SeqBAIJ_N(Mat,Mat*);
1354 extern int MatLUFactorNumeric_SeqBAIJ_1(Mat,Mat*);
1355 extern int MatLUFactorNumeric_SeqBAIJ_2(Mat,Mat*);
1356 extern int MatLUFactorNumeric_SeqBAIJ_3(Mat,Mat*);
1357 extern int MatLUFactorNumeric_SeqBAIJ_4(Mat,Mat*);
1358 extern int MatLUFactorNumeric_SeqBAIJ_5(Mat,Mat*);
1359 
1360 static int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1361 {
1362   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1363   Scalar      *v = a->a;
1364   double      sum = 0.0;
1365   int         i,nz=a->nz,bs2=a->bs2;
1366 
1367   if (type == NORM_FROBENIUS) {
1368     for (i=0; i< bs2*nz; i++ ) {
1369 #if defined(PETSC_COMPLEX)
1370       sum += real(conj(*v)*(*v)); v++;
1371 #else
1372       sum += (*v)*(*v); v++;
1373 #endif
1374     }
1375     *norm = sqrt(sum);
1376   }
1377   else {
1378     SETERRQ(1,"MatNorm_SeqBAIJ:No support for this norm yet");
1379   }
1380   return 0;
1381 }
1382 
1383 /*
1384      note: This can only work for identity for row and col. It would
1385    be good to check this and otherwise generate an error.
1386 */
1387 static int MatILUFactor_SeqBAIJ(Mat inA,IS row,IS col,double efill,int fill)
1388 {
1389   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1390   Mat         outA;
1391   int         ierr;
1392 
1393   if (fill != 0) SETERRQ(1,"MatILUFactor_SeqBAIJ:Only fill=0 supported");
1394 
1395   outA          = inA;
1396   inA->factor   = FACTOR_LU;
1397   a->row        = row;
1398   a->col        = col;
1399 
1400   a->solve_work = (Scalar *) PetscMalloc((a->m+a->bs)*sizeof(Scalar));CHKPTRQ(a->solve_work);
1401 
1402   if (!a->diag) {
1403     ierr = MatMarkDiag_SeqBAIJ(inA); CHKERRQ(ierr);
1404   }
1405   ierr = MatLUFactorNumeric(inA,&outA); CHKERRQ(ierr);
1406   return 0;
1407 }
1408 
1409 static int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1410 {
1411   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1412   int         one = 1, totalnz = a->bs2*a->nz;
1413   BLscal_( &totalnz, alpha, a->a, &one );
1414   PLogFlops(totalnz);
1415   return 0;
1416 }
1417 
1418 static int MatGetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v)
1419 {
1420   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1421   int        *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
1422   int        *ai = a->i, *ailen = a->ilen;
1423   int        brow,bcol,ridx,cidx,bs=a->bs,bs2=a->bs2;
1424   Scalar     *ap, *aa = a->a, zero = 0.0;
1425 
1426   for ( k=0; k<m; k++ ) { /* loop over rows */
1427     row  = im[k]; brow = row/bs;
1428     if (row < 0) SETERRQ(1,"MatGetValues_SeqBAIJ:Negative row");
1429     if (row >= a->m) SETERRQ(1,"MatGetValues_SeqBAIJ:Row too large");
1430     rp   = aj + ai[brow] ; ap = aa + bs2*ai[brow] ;
1431     nrow = ailen[brow];
1432     for ( l=0; l<n; l++ ) { /* loop over columns */
1433       if (in[l] < 0) SETERRQ(1,"MatGetValues_SeqBAIJ:Negative column");
1434       if (in[l] >= a->n) SETERRQ(1,"MatGetValues_SeqBAIJ:Column too large");
1435       col  = in[l] ;
1436       bcol = col/bs;
1437       cidx = col%bs;
1438       ridx = row%bs;
1439       high = nrow;
1440       low  = 0; /* assume unsorted */
1441       while (high-low > 5) {
1442         t = (low+high)/2;
1443         if (rp[t] > bcol) high = t;
1444         else             low  = t;
1445       }
1446       for ( i=low; i<high; i++ ) {
1447         if (rp[i] > bcol) break;
1448         if (rp[i] == bcol) {
1449           *v++ = ap[bs2*i+bs*cidx+ridx];
1450           goto finished;
1451         }
1452       }
1453       *v++ = zero;
1454       finished:;
1455     }
1456   }
1457   return 0;
1458 }
1459 
1460 static int MatGetBlockSize_SeqBAIJ(Mat mat, int *bs)
1461 {
1462   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *) mat->data;
1463   *bs = baij->bs;
1464   return 0;
1465 }
1466 
1467 /* idx should be of length atlease bs */
1468 static int MatZeroRows_SeqBAIJ_Check_Block(int *idx, int bs, PetscTruth *flg)
1469 {
1470   int i,row;
1471   row = idx[0];
1472   if (row%bs!=0) { *flg = PETSC_FALSE; return 0; }
1473 
1474   for ( i=1; i<bs; i++ ) {
1475     if (row+i != idx[i]) { *flg = PETSC_FALSE; return 0; }
1476   }
1477   *flg = PETSC_TRUE;
1478   return 0;
1479 }
1480 
1481 static int MatZeroRows_SeqBAIJ(Mat A,IS is, Scalar *diag)
1482 {
1483   Mat_SeqBAIJ *baij=(Mat_SeqBAIJ*)A->data;
1484   IS          is_local;
1485   int         ierr,i,j,count,m=baij->m,is_n,*is_idx,*rows,bs=baij->bs,bs2=baij->bs2;
1486   PetscTruth  flg;
1487   Scalar      zero = 0.0,*aa;
1488 
1489   /* Make a copy of the IS and  sort it */
1490   ierr = ISGetSize(is,&is_n);CHKERRQ(ierr);
1491   ierr = ISGetIndices(is,&is_idx);CHKERRQ(ierr);
1492   ierr = ISCreateSeq(A->comm,is_n,is_idx,&is_local); CHKERRQ(ierr);
1493   ierr = ISSort(is_local); CHKERRQ(ierr);
1494   ierr = ISGetIndices(is_local,&rows); CHKERRQ(ierr);
1495 
1496   i = 0;
1497   while (i < is_n) {
1498     if (rows[i]<0 || rows[i]>m) SETERRQ(1,"MatZeroRows_SeqBAIJ:row out of range");
1499     flg = PETSC_FALSE;
1500     if (i+bs <= is_n) {ierr = MatZeroRows_SeqBAIJ_Check_Block(rows+i,bs,&flg); CHKERRQ(ierr); }
1501     if (flg) { /* There exists a block of rows to be Zerowed */
1502       baij->ilen[rows[i]/bs] = 0;
1503       i += bs;
1504     } else { /* Zero out only the requested row */
1505       count = (baij->i[rows[i]/bs +1] - baij->i[rows[i]/bs])*bs;
1506       aa    = baij->a + baij->i[rows[i]/bs]*bs2 + (rows[i]%bs);
1507       for ( j=0; j<count; j++ ) {
1508         aa[0] = zero;
1509         aa+=bs;
1510       }
1511       i++;
1512     }
1513   }
1514   if (diag) {
1515     for ( j=0; j<is_n; j++ ) {
1516       ierr = (*A->ops.setvalues)(A,1,rows+j,1,rows+j,diag,INSERT_VALUES);CHKERRQ(ierr);
1517     }
1518   }
1519   ierr = ISRestoreIndices(is,&is_idx); CHKERRQ(ierr);
1520   ierr = ISRestoreIndices(is_local,&rows); CHKERRQ(ierr);
1521   ierr = ISDestroy(is_local); CHKERRQ(ierr);
1522   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1523   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1524 
1525   return 0;
1526 }
1527 int MatPrintHelp_SeqBAIJ(Mat A)
1528 {
1529   static int called = 0;
1530   MPI_Comm   comm = A->comm;
1531 
1532   if (called) return 0; else called = 1;
1533   PetscPrintf(comm," Options for MATSEQBAIJ and MATMPIBAIJ matrix formats (the defaults):\n");
1534   PetscPrintf(comm,"  -mat_block_size <block_size>\n");
1535   return 0;
1536 }
1537 
1538 /* -------------------------------------------------------------------*/
1539 static struct _MatOps MatOps = {MatSetValues_SeqBAIJ,
1540        MatGetRow_SeqBAIJ,MatRestoreRow_SeqBAIJ,
1541        MatMult_SeqBAIJ_N,MatMultAdd_SeqBAIJ_N,
1542        MatMultTrans_SeqBAIJ,MatMultTransAdd_SeqBAIJ,
1543        MatSolve_SeqBAIJ_N,0,
1544        0,0,
1545        MatLUFactor_SeqBAIJ,0,
1546        0,
1547        MatTranspose_SeqBAIJ,
1548        MatGetInfo_SeqBAIJ,MatEqual_SeqBAIJ,
1549        MatGetDiagonal_SeqBAIJ,MatDiagonalScale_SeqBAIJ,MatNorm_SeqBAIJ,
1550        0,MatAssemblyEnd_SeqBAIJ,
1551        0,
1552        MatSetOption_SeqBAIJ,MatZeroEntries_SeqBAIJ,MatZeroRows_SeqBAIJ,
1553        MatGetReordering_SeqBAIJ,
1554        MatLUFactorSymbolic_SeqBAIJ,MatLUFactorNumeric_SeqBAIJ_N,0,0,
1555        MatGetSize_SeqBAIJ,MatGetSize_SeqBAIJ,MatGetOwnershipRange_SeqBAIJ,
1556        MatILUFactorSymbolic_SeqBAIJ,0,
1557        0,0,/*  MatConvert_SeqBAIJ  */ 0,
1558        MatGetSubMatrix_SeqBAIJ,0,
1559        MatConvertSameType_SeqBAIJ,0,0,
1560        MatILUFactor_SeqBAIJ,0,0,
1561        MatGetSubMatrices_SeqBAIJ,MatIncreaseOverlap_SeqBAIJ,
1562        MatGetValues_SeqBAIJ,0,
1563        MatPrintHelp_SeqBAIJ,MatScale_SeqBAIJ,
1564        0,0,0,MatGetBlockSize_SeqBAIJ};
1565 
1566 /*@C
1567    MatCreateSeqBAIJ - Creates a sparse matrix in block AIJ (block
1568    compressed row) format.  For good matrix assembly performance the
1569    user should preallocate the matrix storage by setting the parameter nz
1570    (or the array nzz).  By setting these parameters accurately, performance
1571    during matrix assembly can be increased by more than a factor of 50.
1572 
1573    Input Parameters:
1574 .  comm - MPI communicator, set to MPI_COMM_SELF
1575 .  bs - size of block
1576 .  m - number of rows
1577 .  n - number of columns
1578 .  nz - number of block nonzeros per block row (same for all rows)
1579 .  nzz - array containing the number of block nonzeros in the various block rows
1580          (possibly different for each block row) or PETSC_NULL
1581 
1582    Output Parameter:
1583 .  A - the matrix
1584 
1585    Notes:
1586    The block AIJ format is fully compatible with standard Fortran 77
1587    storage.  That is, the stored row and column indices can begin at
1588    either one (as in Fortran) or zero.  See the users' manual for details.
1589 
1590    Specify the preallocated storage with either nz or nnz (not both).
1591    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
1592    allocation.  For additional details, see the users manual chapter on
1593    matrices and the file $(PETSC_DIR)/Performance.
1594 
1595 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1596 @*/
1597 int MatCreateSeqBAIJ(MPI_Comm comm,int bs,int m,int n,int nz,int *nnz, Mat *A)
1598 {
1599   Mat         B;
1600   Mat_SeqBAIJ *b;
1601   int         i,len,ierr,flg,mbs=m/bs,nbs=n/bs,bs2=bs*bs;
1602 
1603   if (mbs*bs!=m || nbs*bs!=n)
1604     SETERRQ(1,"MatCreateSeqBAIJ:Number rows, cols must be divisible by blocksize");
1605 
1606   *A = 0;
1607   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATSEQBAIJ,comm);
1608   PLogObjectCreate(B);
1609   B->data = (void *) (b = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(b);
1610   PetscMemzero(b,sizeof(Mat_SeqBAIJ));
1611   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1612   ierr = OptionsHasName(PETSC_NULL,"-mat_no_unroll",&flg); CHKERRQ(ierr);
1613   if (!flg) {
1614     switch (bs) {
1615       case 1:
1616         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
1617         B->ops.solve           = MatSolve_SeqBAIJ_1;
1618         B->ops.mult            = MatMult_SeqBAIJ_1;
1619         B->ops.multadd         = MatMultAdd_SeqBAIJ_1;
1620        break;
1621       case 2:
1622         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2;
1623         B->ops.solve           = MatSolve_SeqBAIJ_2;
1624         B->ops.mult            = MatMult_SeqBAIJ_2;
1625         B->ops.multadd         = MatMultAdd_SeqBAIJ_2;
1626         break;
1627       case 3:
1628         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3;
1629         B->ops.solve           = MatSolve_SeqBAIJ_3;
1630         B->ops.mult            = MatMult_SeqBAIJ_3;
1631         B->ops.multadd         = MatMultAdd_SeqBAIJ_3;
1632         break;
1633       case 4:
1634         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4;
1635         B->ops.solve           = MatSolve_SeqBAIJ_4;
1636         B->ops.mult            = MatMult_SeqBAIJ_4;
1637         B->ops.multadd         = MatMultAdd_SeqBAIJ_4;
1638         break;
1639       case 5:
1640         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5;
1641         B->ops.solve           = MatSolve_SeqBAIJ_5;
1642         B->ops.mult            = MatMult_SeqBAIJ_5;
1643         B->ops.multadd         = MatMultAdd_SeqBAIJ_5;
1644         break;
1645     }
1646   }
1647   B->destroy          = MatDestroy_SeqBAIJ;
1648   B->view             = MatView_SeqBAIJ;
1649   B->factor           = 0;
1650   B->lupivotthreshold = 1.0;
1651   b->row              = 0;
1652   b->col              = 0;
1653   b->reallocs         = 0;
1654 
1655   b->m       = m; B->m = m; B->M = m;
1656   b->n       = n; B->n = n; B->N = n;
1657   b->mbs     = mbs;
1658   b->nbs     = nbs;
1659   b->imax    = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(b->imax);
1660   if (nnz == PETSC_NULL) {
1661     if (nz == PETSC_DEFAULT) nz = 5;
1662     else if (nz <= 0)        nz = 1;
1663     for ( i=0; i<mbs; i++ ) b->imax[i] = nz;
1664     nz = nz*mbs;
1665   }
1666   else {
1667     nz = 0;
1668     for ( i=0; i<mbs; i++ ) {b->imax[i] = nnz[i]; nz += nnz[i];}
1669   }
1670 
1671   /* allocate the matrix space */
1672   len   = nz*sizeof(int) + nz*bs2*sizeof(Scalar) + (b->m+1)*sizeof(int);
1673   b->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
1674   PetscMemzero(b->a,nz*bs2*sizeof(Scalar));
1675   b->j  = (int *) (b->a + nz*bs2);
1676   PetscMemzero(b->j,nz*sizeof(int));
1677   b->i  = b->j + nz;
1678   b->singlemalloc = 1;
1679 
1680   b->i[0] = 0;
1681   for (i=1; i<mbs+1; i++) {
1682     b->i[i] = b->i[i-1] + b->imax[i-1];
1683   }
1684 
1685   /* b->ilen will count nonzeros in each block row so far. */
1686   b->ilen = (int *) PetscMalloc((mbs+1)*sizeof(int));
1687   PLogObjectMemory(B,len+2*(mbs+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_SeqBAIJ));
1688   for ( i=0; i<mbs; i++ ) { b->ilen[i] = 0;}
1689 
1690   b->bs               = bs;
1691   b->bs2              = bs2;
1692   b->mbs              = mbs;
1693   b->nz               = 0;
1694   b->maxnz            = nz*bs2;
1695   b->sorted           = 0;
1696   b->roworiented      = 1;
1697   b->nonew            = 0;
1698   b->diag             = 0;
1699   b->solve_work       = 0;
1700   b->mult_work        = 0;
1701   b->spptr            = 0;
1702   *A = B;
1703   ierr = OptionsHasName(PETSC_NULL,"-help", &flg); CHKERRQ(ierr);
1704   if (flg) {ierr = MatPrintHelp(B); CHKERRQ(ierr); }
1705   return 0;
1706 }
1707 
1708 int MatConvertSameType_SeqBAIJ(Mat A,Mat *B,int cpvalues)
1709 {
1710   Mat         C;
1711   Mat_SeqBAIJ *c,*a = (Mat_SeqBAIJ *) A->data;
1712   int         i,len, mbs = a->mbs,nz = a->nz,bs2 =a->bs2;
1713 
1714   if (a->i[mbs] != nz) SETERRQ(1,"MatConvertSameType_SeqBAIJ:Corrupt matrix");
1715 
1716   *B = 0;
1717   PetscHeaderCreate(C,_Mat,MAT_COOKIE,MATSEQBAIJ,A->comm);
1718   PLogObjectCreate(C);
1719   C->data       = (void *) (c = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(c);
1720   PetscMemcpy(&C->ops,&A->ops,sizeof(struct _MatOps));
1721   C->destroy    = MatDestroy_SeqBAIJ;
1722   C->view       = MatView_SeqBAIJ;
1723   C->factor     = A->factor;
1724   c->row        = 0;
1725   c->col        = 0;
1726   C->assembled  = PETSC_TRUE;
1727 
1728   c->m = C->m   = a->m;
1729   c->n = C->n   = a->n;
1730   C->M          = a->m;
1731   C->N          = a->n;
1732 
1733   c->bs         = a->bs;
1734   c->bs2        = a->bs2;
1735   c->mbs        = a->mbs;
1736   c->nbs        = a->nbs;
1737 
1738   c->imax       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->imax);
1739   c->ilen       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->ilen);
1740   for ( i=0; i<mbs; i++ ) {
1741     c->imax[i] = a->imax[i];
1742     c->ilen[i] = a->ilen[i];
1743   }
1744 
1745   /* allocate the matrix space */
1746   c->singlemalloc = 1;
1747   len   = (mbs+1)*sizeof(int) + nz*(bs2*sizeof(Scalar) + sizeof(int));
1748   c->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(c->a);
1749   c->j  = (int *) (c->a + nz*bs2);
1750   c->i  = c->j + nz;
1751   PetscMemcpy(c->i,a->i,(mbs+1)*sizeof(int));
1752   if (mbs > 0) {
1753     PetscMemcpy(c->j,a->j,nz*sizeof(int));
1754     if (cpvalues == COPY_VALUES) {
1755       PetscMemcpy(c->a,a->a,bs2*nz*sizeof(Scalar));
1756     }
1757   }
1758 
1759   PLogObjectMemory(C,len+2*(mbs+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_SeqBAIJ));
1760   c->sorted      = a->sorted;
1761   c->roworiented = a->roworiented;
1762   c->nonew       = a->nonew;
1763 
1764   if (a->diag) {
1765     c->diag = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(c->diag);
1766     PLogObjectMemory(C,(mbs+1)*sizeof(int));
1767     for ( i=0; i<mbs; i++ ) {
1768       c->diag[i] = a->diag[i];
1769     }
1770   }
1771   else c->diag          = 0;
1772   c->nz                 = a->nz;
1773   c->maxnz              = a->maxnz;
1774   c->solve_work         = 0;
1775   c->spptr              = 0;      /* Dangerous -I'm throwing away a->spptr */
1776   c->mult_work          = 0;
1777   *B = C;
1778   return 0;
1779 }
1780 
1781 int MatLoad_SeqBAIJ(Viewer viewer,MatType type,Mat *A)
1782 {
1783   Mat_SeqBAIJ  *a;
1784   Mat          B;
1785   int          i,nz,ierr,fd,header[4],size,*rowlengths=0,M,N,bs=1,flg;
1786   int          *mask,mbs,*jj,j,rowcount,nzcount,k,*browlengths,maskcount;
1787   int          kmax,jcount,block,idx,point,nzcountb,extra_rows;
1788   int          *masked, nmask,tmp,bs2,ishift;
1789   Scalar       *aa;
1790   MPI_Comm     comm = ((PetscObject) viewer)->comm;
1791 
1792   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
1793   bs2  = bs*bs;
1794 
1795   MPI_Comm_size(comm,&size);
1796   if (size > 1) SETERRQ(1,"MatLoad_SeqBAIJ:view must have one processor");
1797   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1798   ierr = PetscBinaryRead(fd,header,4,BINARY_INT); CHKERRQ(ierr);
1799   if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_SeqBAIJ:not Mat object");
1800   M = header[1]; N = header[2]; nz = header[3];
1801 
1802   if (M != N) SETERRQ(1,"MatLoad_SeqBAIJ:Can only do square matrices");
1803 
1804   /*
1805      This code adds extra rows to make sure the number of rows is
1806     divisible by the blocksize
1807   */
1808   mbs        = M/bs;
1809   extra_rows = bs - M + bs*(mbs);
1810   if (extra_rows == bs) extra_rows = 0;
1811   else                  mbs++;
1812   if (extra_rows) {
1813     PLogInfo(0,"MatLoad_SeqBAIJ:Padding loaded matrix to match blocksize");
1814   }
1815 
1816   /* read in row lengths */
1817   rowlengths = (int*) PetscMalloc((M+extra_rows)*sizeof(int));CHKPTRQ(rowlengths);
1818   ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
1819   for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
1820 
1821   /* read in column indices */
1822   jj = (int*) PetscMalloc( (nz+extra_rows)*sizeof(int) ); CHKPTRQ(jj);
1823   ierr = PetscBinaryRead(fd,jj,nz,BINARY_INT); CHKERRQ(ierr);
1824   for ( i=0; i<extra_rows; i++ ) jj[nz+i] = M+i;
1825 
1826   /* loop over row lengths determining block row lengths */
1827   browlengths = (int *) PetscMalloc(mbs*sizeof(int));CHKPTRQ(browlengths);
1828   PetscMemzero(browlengths,mbs*sizeof(int));
1829   mask   = (int *) PetscMalloc( 2*mbs*sizeof(int) ); CHKPTRQ(mask);
1830   PetscMemzero(mask,mbs*sizeof(int));
1831   masked = mask + mbs;
1832   rowcount = 0; nzcount = 0;
1833   for ( i=0; i<mbs; i++ ) {
1834     nmask = 0;
1835     for ( j=0; j<bs; j++ ) {
1836       kmax = rowlengths[rowcount];
1837       for ( k=0; k<kmax; k++ ) {
1838         tmp = jj[nzcount++]/bs;
1839         if (!mask[tmp]) {masked[nmask++] = tmp; mask[tmp] = 1;}
1840       }
1841       rowcount++;
1842     }
1843     browlengths[i] += nmask;
1844     /* zero out the mask elements we set */
1845     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
1846   }
1847 
1848   /* create our matrix */
1849   ierr = MatCreateSeqBAIJ(comm,bs,M+extra_rows,N+extra_rows,0,browlengths,A);
1850          CHKERRQ(ierr);
1851   B = *A;
1852   a = (Mat_SeqBAIJ *) B->data;
1853 
1854   /* set matrix "i" values */
1855   a->i[0] = 0;
1856   for ( i=1; i<= mbs; i++ ) {
1857     a->i[i]      = a->i[i-1] + browlengths[i-1];
1858     a->ilen[i-1] = browlengths[i-1];
1859   }
1860   a->nz         = 0;
1861   for ( i=0; i<mbs; i++ ) a->nz += browlengths[i];
1862 
1863   /* read in nonzero values */
1864   aa = (Scalar *) PetscMalloc((nz+extra_rows)*sizeof(Scalar));CHKPTRQ(aa);
1865   ierr = PetscBinaryRead(fd,aa,nz,BINARY_SCALAR); CHKERRQ(ierr);
1866   for ( i=0; i<extra_rows; i++ ) aa[nz+i] = 1.0;
1867 
1868   /* set "a" and "j" values into matrix */
1869   nzcount = 0; jcount = 0;
1870   for ( i=0; i<mbs; i++ ) {
1871     nzcountb = nzcount;
1872     nmask    = 0;
1873     for ( j=0; j<bs; j++ ) {
1874       kmax = rowlengths[i*bs+j];
1875       for ( k=0; k<kmax; k++ ) {
1876         tmp = jj[nzcount++]/bs;
1877 	if (!mask[tmp]) { masked[nmask++] = tmp; mask[tmp] = 1;}
1878       }
1879       rowcount++;
1880     }
1881     /* sort the masked values */
1882     PetscSortInt(nmask,masked);
1883 
1884     /* set "j" values into matrix */
1885     maskcount = 1;
1886     for ( j=0; j<nmask; j++ ) {
1887       a->j[jcount++]  = masked[j];
1888       mask[masked[j]] = maskcount++;
1889     }
1890     /* set "a" values into matrix */
1891     ishift = bs2*a->i[i];
1892     for ( j=0; j<bs; j++ ) {
1893       kmax = rowlengths[i*bs+j];
1894       for ( k=0; k<kmax; k++ ) {
1895         tmp    = jj[nzcountb]/bs ;
1896         block  = mask[tmp] - 1;
1897         point  = jj[nzcountb] - bs*tmp;
1898         idx    = ishift + bs2*block + j + bs*point;
1899         a->a[idx] = aa[nzcountb++];
1900       }
1901     }
1902     /* zero out the mask elements we set */
1903     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
1904   }
1905   if (jcount != a->nz) SETERRQ(1,"MatLoad_SeqBAIJ:Error bad binary matrix");
1906 
1907   PetscFree(rowlengths);
1908   PetscFree(browlengths);
1909   PetscFree(aa);
1910   PetscFree(jj);
1911   PetscFree(mask);
1912 
1913   B->assembled = PETSC_TRUE;
1914 
1915   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info",&flg); CHKERRQ(ierr);
1916   if (flg) {
1917     Viewer tviewer;
1918     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
1919     ierr = ViewerSetFormat(tviewer,ASCII_FORMAT_INFO,0);CHKERRQ(ierr);
1920     ierr = MatView(B,tviewer); CHKERRQ(ierr);
1921     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1922   }
1923   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info_detailed",&flg);CHKERRQ(ierr);
1924   if (flg) {
1925     Viewer tviewer;
1926     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
1927     ierr = ViewerSetFormat(tviewer,ASCII_FORMAT_INFO_DETAILED,0);CHKERRQ(ierr);
1928     ierr = MatView(B,tviewer); CHKERRQ(ierr);
1929     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1930   }
1931   ierr = OptionsHasName(PETSC_NULL,"-mat_view",&flg); CHKERRQ(ierr);
1932   if (flg) {
1933     Viewer tviewer;
1934     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
1935     ierr = MatView(B,tviewer); CHKERRQ(ierr);
1936     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1937   }
1938   ierr = OptionsHasName(PETSC_NULL,"-mat_view_matlab",&flg); CHKERRQ(ierr);
1939   if (flg) {
1940     Viewer tviewer;
1941     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
1942     ierr = ViewerSetFormat(tviewer,ASCII_FORMAT_MATLAB,"M");CHKERRQ(ierr);
1943     ierr = MatView(B,tviewer); CHKERRQ(ierr);
1944     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1945   }
1946   ierr = OptionsHasName(PETSC_NULL,"-mat_view_draw",&flg); CHKERRQ(ierr);
1947   if (flg) {
1948     Viewer tviewer;
1949     ierr = ViewerDrawOpenX(B->comm,0,0,0,0,300,300,&tviewer); CHKERRQ(ierr);
1950     ierr = MatView(B,(Viewer)tviewer); CHKERRQ(ierr);
1951     ierr = ViewerFlush(tviewer); CHKERRQ(ierr);
1952     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1953   }
1954   return 0;
1955 }
1956 
1957 
1958 
1959