xref: /petsc/src/mat/impls/baij/seq/baij.c (revision 49f289441cf4e0553157c9cbf75d3484d13cb3f6)
1 
2 #ifndef lint
3 static char vcid[] = "$Id: baij.c,v 1.58 1996/07/10 14:56:44 balay Exp balay $";
4 #endif
5 
6 /*
7     Defines the basic matrix operations for the BAIJ (compressed row)
8   matrix storage format.
9 */
10 #include "baij.h"
11 #include "src/vec/vecimpl.h"
12 #include "src/inline/spops.h"
13 #include "petsc.h"
14 
15 extern   int MatToSymmetricIJ_SeqAIJ(int,int*,int*,int,int,int**,int**);
16 
17 static int MatGetReordering_SeqBAIJ(Mat A,MatReordering type,IS *rperm,IS *cperm)
18 {
19   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
20   int         ierr, *ia, *ja,n = a->mbs,*idx,i,ishift,oshift;
21 
22   /*
23      this is tacky: In the future when we have written special factorization
24      and solve routines for the identity permutation we should use a
25      stride index set instead of the general one.
26   */
27   if (type  == ORDER_NATURAL) {
28     idx = (int *) PetscMalloc( n*sizeof(int) ); CHKPTRQ(idx);
29     for ( i=0; i<n; i++ ) idx[i] = i;
30     ierr = ISCreateSeq(MPI_COMM_SELF,n,idx,rperm); CHKERRQ(ierr);
31     ierr = ISCreateSeq(MPI_COMM_SELF,n,idx,cperm); CHKERRQ(ierr);
32     PetscFree(idx);
33     ISSetPermutation(*rperm);
34     ISSetPermutation(*cperm);
35     ISSetIdentity(*rperm);
36     ISSetIdentity(*cperm);
37     return 0;
38   }
39 
40   MatReorderingRegisterAll();
41   ishift = 0;
42   oshift = -MatReorderingIndexShift[(int)type];
43   if (MatReorderingRequiresSymmetric[(int)type]) {
44     ierr = MatToSymmetricIJ_SeqAIJ(n,a->i,a->j,ishift,oshift,&ia,&ja);CHKERRQ(ierr);
45     ierr = MatGetReordering_IJ(n,ia,ja,type,rperm,cperm); CHKERRQ(ierr);
46     PetscFree(ia); PetscFree(ja);
47   } else {
48     if (ishift == oshift) {
49       ierr = MatGetReordering_IJ(n,a->i,a->j,type,rperm,cperm);CHKERRQ(ierr);
50     }
51     else if (ishift == -1) {
52       /* temporarily subtract 1 from i and j indices */
53       int nz = a->i[n] - 1;
54       for ( i=0; i<nz; i++ ) a->j[i]--;
55       for ( i=0; i<n+1; i++ ) a->i[i]--;
56       ierr = MatGetReordering_IJ(n,a->i,a->j,type,rperm,cperm);CHKERRQ(ierr);
57       for ( i=0; i<nz; i++ ) a->j[i]++;
58       for ( i=0; i<n+1; i++ ) a->i[i]++;
59     } else {
60       /* temporarily add 1 to i and j indices */
61       int nz = a->i[n] - 1;
62       for ( i=0; i<nz; i++ ) a->j[i]++;
63       for ( i=0; i<n+1; i++ ) a->i[i]++;
64       ierr = MatGetReordering_IJ(n,a->i,a->j,type,rperm,cperm);CHKERRQ(ierr);
65       for ( i=0; i<nz; i++ ) a->j[i]--;
66       for ( i=0; i<n+1; i++ ) a->i[i]--;
67     }
68   }
69   return 0;
70 }
71 
72 /*
73      Adds diagonal pointers to sparse matrix structure.
74 */
75 
76 int MatMarkDiag_SeqBAIJ(Mat A)
77 {
78   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
79   int         i,j, *diag, m = a->mbs;
80 
81   diag = (int *) PetscMalloc( (m+1)*sizeof(int)); CHKPTRQ(diag);
82   PLogObjectMemory(A,(m+1)*sizeof(int));
83   for ( i=0; i<m; i++ ) {
84     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
85       if (a->j[j] == i) {
86         diag[i] = j;
87         break;
88       }
89     }
90   }
91   a->diag = diag;
92   return 0;
93 }
94 
95 #include "draw.h"
96 #include "pinclude/pviewer.h"
97 #include "sys.h"
98 
99 static int MatView_SeqBAIJ_Binary(Mat A,Viewer viewer)
100 {
101   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
102   int         i, fd, *col_lens, ierr, bs = a->bs,count,*jj,j,k,l,bs2=a->bs2;
103   Scalar      *aa;
104 
105   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
106   col_lens = (int *) PetscMalloc((4+a->m)*sizeof(int));CHKPTRQ(col_lens);
107   col_lens[0] = MAT_COOKIE;
108   col_lens[1] = a->m;
109   col_lens[2] = a->n;
110   col_lens[3] = a->nz*bs2;
111 
112   /* store lengths of each row and write (including header) to file */
113   count = 0;
114   for ( i=0; i<a->mbs; i++ ) {
115     for ( j=0; j<bs; j++ ) {
116       col_lens[4+count++] = bs*(a->i[i+1] - a->i[i]);
117     }
118   }
119   ierr = PetscBinaryWrite(fd,col_lens,4+a->m,BINARY_INT,1); CHKERRQ(ierr);
120   PetscFree(col_lens);
121 
122   /* store column indices (zero start index) */
123   jj = (int *) PetscMalloc( a->nz*bs2*sizeof(int) ); CHKPTRQ(jj);
124   count = 0;
125   for ( i=0; i<a->mbs; i++ ) {
126     for ( j=0; j<bs; j++ ) {
127       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
128         for ( l=0; l<bs; l++ ) {
129           jj[count++] = bs*a->j[k] + l;
130         }
131       }
132     }
133   }
134   ierr = PetscBinaryWrite(fd,jj,bs2*a->nz,BINARY_INT,0); CHKERRQ(ierr);
135   PetscFree(jj);
136 
137   /* store nonzero values */
138   aa = (Scalar *) PetscMalloc(a->nz*bs2*sizeof(Scalar)); CHKPTRQ(aa);
139   count = 0;
140   for ( i=0; i<a->mbs; i++ ) {
141     for ( j=0; j<bs; j++ ) {
142       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
143         for ( l=0; l<bs; l++ ) {
144           aa[count++] = a->a[bs2*k + l*bs + j];
145         }
146       }
147     }
148   }
149   ierr = PetscBinaryWrite(fd,aa,bs2*a->nz,BINARY_SCALAR,0); CHKERRQ(ierr);
150   PetscFree(aa);
151   return 0;
152 }
153 
154 static int MatView_SeqBAIJ_ASCII(Mat A,Viewer viewer)
155 {
156   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
157   int         ierr, i,j,format,bs = a->bs,k,l,bs2=a->bs2;
158   FILE        *fd;
159   char        *outputname;
160 
161   ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
162   ierr = ViewerFileGetOutputname_Private(viewer,&outputname);CHKERRQ(ierr);
163   ierr = ViewerGetFormat(viewer,&format);
164   if (format == ASCII_FORMAT_INFO || format == ASCII_FORMAT_INFO_DETAILED) {
165     fprintf(fd,"  block size is %d\n",bs);
166   }
167   else if (format == ASCII_FORMAT_MATLAB) {
168     SETERRQ(1,"MatView_SeqBAIJ_ASCII:Matlab format not supported");
169   }
170   else if (format == ASCII_FORMAT_COMMON) {
171     for ( i=0; i<a->mbs; i++ ) {
172       for ( j=0; j<bs; j++ ) {
173         fprintf(fd,"row %d:",i*bs+j);
174         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
175           for ( l=0; l<bs; l++ ) {
176 #if defined(PETSC_COMPLEX)
177           if (imag(a->a[bs2*k + l*bs + j]) != 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
178             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
179               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
180           else if (real(a->a[bs2*k + l*bs + j]) != 0.0)
181             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
182 #else
183           if (a->a[bs2*k + l*bs + j] != 0.0)
184             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
185 #endif
186           }
187         }
188         fprintf(fd,"\n");
189       }
190     }
191   }
192   else {
193     for ( i=0; i<a->mbs; i++ ) {
194       for ( j=0; j<bs; j++ ) {
195         fprintf(fd,"row %d:",i*bs+j);
196         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
197           for ( l=0; l<bs; l++ ) {
198 #if defined(PETSC_COMPLEX)
199           if (imag(a->a[bs2*k + l*bs + j]) != 0.0) {
200             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
201               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
202           }
203           else {
204             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
205           }
206 #else
207             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
208 #endif
209           }
210         }
211         fprintf(fd,"\n");
212       }
213     }
214   }
215   fflush(fd);
216   return 0;
217 }
218 
219 static int MatView_SeqBAIJ(PetscObject obj,Viewer viewer)
220 {
221   Mat         A = (Mat) obj;
222   ViewerType  vtype;
223   int         ierr;
224 
225   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
226   if (vtype == MATLAB_VIEWER) {
227     SETERRQ(1,"MatView_SeqBAIJ:Matlab viewer not supported");
228   }
229   else if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER){
230     return MatView_SeqBAIJ_ASCII(A,viewer);
231   }
232   else if (vtype == BINARY_FILE_VIEWER) {
233     return MatView_SeqBAIJ_Binary(A,viewer);
234   }
235   else if (vtype == DRAW_VIEWER) {
236     SETERRQ(1,"MatView_SeqBAIJ:Draw viewer not supported");
237   }
238   return 0;
239 }
240 
241 #define CHUNKSIZE  10
242 
243 /* This version has row oriented v  */
244 static int MatSetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
245 {
246   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
247   int         *rp,k,low,high,t,ii,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
248   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen,roworiented=a->roworiented;
249   int         *aj=a->j,nonew=a->nonew,bs=a->bs,brow,bcol;
250   int          ridx,cidx,bs2=a->bs2;
251   Scalar      *ap,value,*aa=a->a,*bap;
252 
253   for ( k=0; k<m; k++ ) { /* loop over added rows */
254     row  = im[k]; brow = row/bs;
255     if (row < 0) SETERRQ(1,"MatSetValues_SeqBAIJ:Negative row");
256     if (row >= a->m) SETERRQ(1,"MatSetValues_SeqBAIJ:Row too large");
257     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
258     rmax = imax[brow]; nrow = ailen[brow];
259     low = 0;
260     for ( l=0; l<n; l++ ) { /* loop over added columns */
261       if (in[l] < 0) SETERRQ(1,"MatSetValues_SeqBAIJ:Negative column");
262       if (in[l] >= a->n) SETERRQ(1,"MatSetValues_SeqBAIJ:Column too large");
263       col = in[l]; bcol = col/bs;
264       ridx = row % bs; cidx = col % bs;
265       if (roworiented) {
266         value = *v++;
267       }
268       else {
269         value = v[k + l*m];
270       }
271       if (!sorted) low = 0; high = nrow;
272       while (high-low > 5) {
273         t = (low+high)/2;
274         if (rp[t] > bcol) high = t;
275         else             low  = t;
276       }
277       for ( i=low; i<high; i++ ) {
278         if (rp[i] > bcol) break;
279         if (rp[i] == bcol) {
280           bap  = ap +  bs2*i + bs*cidx + ridx;
281           if (is == ADD_VALUES) *bap += value;
282           else                  *bap  = value;
283           goto noinsert;
284         }
285       }
286       if (nonew) goto noinsert;
287       if (nrow >= rmax) {
288         /* there is no extra room in row, therefore enlarge */
289         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
290         Scalar *new_a;
291 
292         /* malloc new storage space */
293         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
294         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
295         new_j   = (int *) (new_a + bs2*new_nz);
296         new_i   = new_j + new_nz;
297 
298         /* copy over old data into new slots */
299         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];}
300         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
301         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));
302         len = (new_nz - CHUNKSIZE - ai[brow] - nrow);
303         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,
304                                                            len*sizeof(int));
305         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar));
306         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
307         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE),
308                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));
309         /* free up old matrix storage */
310         PetscFree(a->a);
311         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
312         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
313         a->singlemalloc = 1;
314 
315         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
316         rmax = imax[brow] = imax[brow] + CHUNKSIZE;
317         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
318         a->maxnz += bs2*CHUNKSIZE;
319         a->reallocs++;
320         a->nz++;
321       }
322       N = nrow++ - 1;
323       /* shift up all the later entries in this row */
324       for ( ii=N; ii>=i; ii-- ) {
325         rp[ii+1] = rp[ii];
326          PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
327       }
328       if (N>=i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
329       rp[i] = bcol;
330       ap[bs2*i + bs*cidx + ridx] = value;
331       noinsert:;
332       low = i;
333     }
334     ailen[brow] = nrow;
335   }
336   return 0;
337 }
338 
339 static int MatGetSize_SeqBAIJ(Mat A,int *m,int *n)
340 {
341   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
342   *m = a->m; *n = a->n;
343   return 0;
344 }
345 
346 static int MatGetOwnershipRange_SeqBAIJ(Mat A,int *m,int *n)
347 {
348   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
349   *m = 0; *n = a->m;
350   return 0;
351 }
352 
353 int MatGetRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
354 {
355   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
356   int          itmp,i,j,k,M,*ai,*aj,bs,bn,bp,*idx_i,bs2;
357   Scalar      *aa,*v_i,*aa_i;
358 
359   bs  = a->bs;
360   ai  = a->i;
361   aj  = a->j;
362   aa  = a->a;
363   bs2 = a->bs2;
364 
365   if (row < 0 || row >= a->m) SETERRQ(1,"MatGetRow_SeqBAIJ:Row out of range");
366 
367   bn  = row/bs;   /* Block number */
368   bp  = row % bs; /* Block Position */
369   M   = ai[bn+1] - ai[bn];
370   *nz = bs*M;
371 
372   if (v) {
373     *v = 0;
374     if (*nz) {
375       *v = (Scalar *) PetscMalloc( (*nz)*sizeof(Scalar) ); CHKPTRQ(*v);
376       for ( i=0; i<M; i++ ) { /* for each block in the block row */
377         v_i  = *v + i*bs;
378         aa_i = aa + bs2*(ai[bn] + i);
379         for ( j=bp,k=0; j<bs2; j+=bs,k++ ) {v_i[k] = aa_i[j];}
380       }
381     }
382   }
383 
384   if (idx) {
385     *idx = 0;
386     if (*nz) {
387       *idx = (int *) PetscMalloc( (*nz)*sizeof(int) ); CHKPTRQ(*idx);
388       for ( i=0; i<M; i++ ) { /* for each block in the block row */
389         idx_i = *idx + i*bs;
390         itmp  = bs*aj[ai[bn] + i];
391         for ( j=0; j<bs; j++ ) {idx_i[j] = itmp++;}
392       }
393     }
394   }
395   return 0;
396 }
397 
398 int MatRestoreRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
399 {
400   if (idx) {if (*idx) PetscFree(*idx);}
401   if (v)   {if (*v)   PetscFree(*v);}
402   return 0;
403 }
404 
405 static int MatTranspose_SeqBAIJ(Mat A,Mat *B)
406 {
407   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
408   Mat         C;
409   int         i,j,k,ierr,*aj=a->j,*ai=a->i,bs=a->bs,mbs=a->mbs,nbs=a->nbs,len,*col;
410   int         *rows,*cols,bs2=a->bs2;
411   Scalar      *array=a->a;
412 
413   if (B==PETSC_NULL && mbs!=nbs)
414     SETERRQ(1,"MatTranspose_SeqBAIJ:Square matrix only for in-place");
415   col = (int *) PetscMalloc((1+nbs)*sizeof(int)); CHKPTRQ(col);
416   PetscMemzero(col,(1+nbs)*sizeof(int));
417 
418   for ( i=0; i<ai[mbs]; i++ ) col[aj[i]] += 1;
419   ierr = MatCreateSeqBAIJ(A->comm,bs,a->n,a->m,PETSC_NULL,col,&C); CHKERRQ(ierr);
420   PetscFree(col);
421   rows = (int *) PetscMalloc(2*bs*sizeof(int)); CHKPTRQ(rows);
422   cols = rows + bs;
423   for ( i=0; i<mbs; i++ ) {
424     cols[0] = i*bs;
425     for (k=1; k<bs; k++ ) cols[k] = cols[k-1] + 1;
426     len = ai[i+1] - ai[i];
427     for ( j=0; j<len; j++ ) {
428       rows[0] = (*aj++)*bs;
429       for (k=1; k<bs; k++ ) rows[k] = rows[k-1] + 1;
430       ierr = MatSetValues(C,bs,rows,bs,cols,array,INSERT_VALUES); CHKERRQ(ierr);
431       array += bs2;
432     }
433   }
434   PetscFree(rows);
435 
436   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
437   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
438 
439   if (B != PETSC_NULL) {
440     *B = C;
441   } else {
442     /* This isn't really an in-place transpose */
443     PetscFree(a->a);
444     if (!a->singlemalloc) {PetscFree(a->i); PetscFree(a->j);}
445     if (a->diag) PetscFree(a->diag);
446     if (a->ilen) PetscFree(a->ilen);
447     if (a->imax) PetscFree(a->imax);
448     if (a->solve_work) PetscFree(a->solve_work);
449     PetscFree(a);
450     PetscMemcpy(A,C,sizeof(struct _Mat));
451     PetscHeaderDestroy(C);
452   }
453   return 0;
454 }
455 
456 
457 static int MatAssemblyEnd_SeqBAIJ(Mat A,MatAssemblyType mode)
458 {
459   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
460   int        fshift = 0,i,j,*ai = a->i, *aj = a->j, *imax = a->imax;
461   int        m = a->m,*ip, N, *ailen = a->ilen;
462   int        mbs = a->mbs, bs2 = a->bs2;
463   Scalar     *aa = a->a, *ap;
464 
465   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
466 
467   for ( i=1; i<mbs; i++ ) {
468     /* move each row back by the amount of empty slots (fshift) before it*/
469     fshift += imax[i-1] - ailen[i-1];
470     if (fshift) {
471       ip = aj + ai[i]; ap = aa + bs2*ai[i];
472       N = ailen[i];
473       for ( j=0; j<N; j++ ) {
474         ip[j-fshift] = ip[j];
475         PetscMemcpy(ap+(j-fshift)*bs2,ap+j*bs2,bs2*sizeof(Scalar));
476       }
477     }
478     ai[i] = ai[i-1] + ailen[i-1];
479   }
480   if (mbs) {
481     fshift += imax[mbs-1] - ailen[mbs-1];
482     ai[mbs] = ai[mbs-1] + ailen[mbs-1];
483   }
484   /* reset ilen and imax for each row */
485   for ( i=0; i<mbs; i++ ) {
486     ailen[i] = imax[i] = ai[i+1] - ai[i];
487   }
488   a->nz = ai[mbs];
489 
490   /* diagonals may have moved, so kill the diagonal pointers */
491   if (fshift && a->diag) {
492     PetscFree(a->diag);
493     PLogObjectMemory(A,-(m+1)*sizeof(int));
494     a->diag = 0;
495   }
496   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ: Unneed storage space (blocks) %d used %d, rows %d, block size %d\n", fshift*bs2,a->nz*bs2,m,a->bs);
497   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ: Number of mallocs during MatSetValues %d\n",
498            a->reallocs);
499   return 0;
500 }
501 
502 static int MatZeroEntries_SeqBAIJ(Mat A)
503 {
504   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
505   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
506   return 0;
507 }
508 
509 int MatDestroy_SeqBAIJ(PetscObject obj)
510 {
511   Mat         A  = (Mat) obj;
512   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
513 
514 #if defined(PETSC_LOG)
515   PLogObjectState(obj,"Rows=%d, Cols=%d, NZ=%d",a->m,a->n,a->nz);
516 #endif
517   PetscFree(a->a);
518   if (!a->singlemalloc) { PetscFree(a->i); PetscFree(a->j);}
519   if (a->diag) PetscFree(a->diag);
520   if (a->ilen) PetscFree(a->ilen);
521   if (a->imax) PetscFree(a->imax);
522   if (a->solve_work) PetscFree(a->solve_work);
523   if (a->mult_work) PetscFree(a->mult_work);
524   PetscFree(a);
525   PLogObjectDestroy(A);
526   PetscHeaderDestroy(A);
527   return 0;
528 }
529 
530 static int MatSetOption_SeqBAIJ(Mat A,MatOption op)
531 {
532   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
533   if      (op == MAT_ROW_ORIENTED)              a->roworiented = 1;
534   else if (op == MAT_COLUMN_ORIENTED)           a->roworiented = 0;
535   else if (op == MAT_COLUMNS_SORTED)            a->sorted      = 1;
536   else if (op == MAT_NO_NEW_NONZERO_LOCATIONS)  a->nonew       = 1;
537   else if (op == MAT_YES_NEW_NONZERO_LOCATIONS) a->nonew       = 0;
538   else if (op == MAT_ROWS_SORTED ||
539            op == MAT_SYMMETRIC ||
540            op == MAT_STRUCTURALLY_SYMMETRIC ||
541            op == MAT_YES_NEW_DIAGONALS)
542     PLogInfo(A,"Info:MatSetOption_SeqBAIJ:Option ignored\n");
543   else if (op == MAT_NO_NEW_DIAGONALS)
544     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_SeqBAIJ:MAT_NO_NEW_DIAGONALS");}
545   else
546     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_SeqBAIJ:unknown option");}
547   return 0;
548 }
549 
550 
551 /* -------------------------------------------------------*/
552 /* Should check that shapes of vectors and matrices match */
553 /* -------------------------------------------------------*/
554 #include "pinclude/plapack.h"
555 
556 static int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
557 {
558   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
559   Scalar          *xg,*zg;
560   register Scalar *x,*z,*v,sum;
561   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
562 
563   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
564   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
565 
566   idx   = a->j;
567   v     = a->a;
568   ii    = a->i;
569 
570   for ( i=0; i<mbs; i++ ) {
571     n    = ii[1] - ii[0]; ii++;
572     sum  = 0.0;
573     while (n--) sum += *v++ * x[*idx++];
574     z[i] = sum;
575   }
576   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
577   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
578   PLogFlops(2*a->nz - a->m);
579   return 0;
580 }
581 
582 static int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
583 {
584   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
585   Scalar          *xg,*zg;
586   register Scalar *x,*z,*v,*xb,sum1,sum2;
587   register Scalar x1,x2;
588   int             mbs=a->mbs,i,*idx,*ii;
589   int             j,n,ierr;
590 
591   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
592   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
593 
594   idx   = a->j;
595   v     = a->a;
596   ii    = a->i;
597 
598   for ( i=0; i<mbs; i++ ) {
599     n  = ii[1] - ii[0]; ii++;
600     sum1 = 0.0; sum2 = 0.0;
601     for ( j=0; j<n; j++ ) {
602       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
603       sum1 += v[0]*x1 + v[2]*x2;
604       sum2 += v[1]*x1 + v[3]*x2;
605       v += 4;
606     }
607     z[0] = sum1; z[1] = sum2;
608     z += 2;
609   }
610   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
611   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
612   PLogFlops(4*a->nz - a->m);
613   return 0;
614 }
615 
616 static int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
617 {
618   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
619   Scalar          *xg,*zg;
620   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
621   int             mbs=a->mbs,i,*idx,*ii,j,n,ierr;
622 
623   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
624   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
625 
626   idx   = a->j;
627   v     = a->a;
628   ii    = a->i;
629 
630   for ( i=0; i<mbs; i++ ) {
631     n  = ii[1] - ii[0]; ii++;
632     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
633     for ( j=0; j<n; j++ ) {
634       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
635       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
636       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
637       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
638       v += 9;
639     }
640     z[0] = sum1; z[1] = sum2; z[2] = sum3;
641     z += 3;
642   }
643   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
644   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
645   PLogFlops(18*a->nz - a->m);
646   return 0;
647 }
648 
649 static int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
650 {
651   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
652   Scalar          *xg,*zg;
653   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4;
654   register Scalar x1,x2,x3,x4;
655   int             mbs=a->mbs,i,*idx,*ii;
656   int             j,n,ierr;
657 
658   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
659   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
660 
661   idx   = a->j;
662   v     = a->a;
663   ii    = a->i;
664 
665   for ( i=0; i<mbs; i++ ) {
666     n  = ii[1] - ii[0]; ii++;
667     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
668     for ( j=0; j<n; j++ ) {
669       xb = x + 4*(*idx++);
670       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
671       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
672       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
673       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
674       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
675       v += 16;
676     }
677     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
678     z += 4;
679   }
680   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
681   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
682   PLogFlops(32*a->nz - a->m);
683   return 0;
684 }
685 
686 static int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
687 {
688   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
689   Scalar          *xg,*zg;
690   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
691   register Scalar x1,x2,x3,x4,x5;
692   int             mbs=a->mbs,i,*idx,*ii,j,n,ierr;
693 
694   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
695   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
696 
697   idx   = a->j;
698   v     = a->a;
699   ii    = a->i;
700 
701   for ( i=0; i<mbs; i++ ) {
702     n  = ii[1] - ii[0]; ii++;
703     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
704     for ( j=0; j<n; j++ ) {
705       xb = x + 5*(*idx++);
706       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
707       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
708       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
709       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
710       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
711       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
712       v += 25;
713     }
714     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
715     z += 5;
716   }
717   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
718   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
719   PLogFlops(50*a->nz - a->m);
720   return 0;
721 }
722 
723 static int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
724 {
725   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
726   Scalar          *xg,*zg;
727   register Scalar *x,*z,*v,*xb;
728   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
729   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt, _DZero = 0.0;
730 
731   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
732   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
733 
734   idx   = a->j;
735   v     = a->a;
736   ii    = a->i;
737 
738 
739   if (!a->mult_work) {
740     k = PetscMax(a->m,a->n);
741     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
742   }
743   work = a->mult_work;
744   for ( i=0; i<mbs; i++ ) {
745     n     = ii[1] - ii[0]; ii++;
746     ncols = n*bs;
747     workt = work;
748     for ( j=0; j<n; j++ ) {
749       xb = x + bs*(*idx++);
750       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
751       workt += bs;
752     }
753     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
754     v += n*bs2;
755     z += bs;
756   }
757    ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
758   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
759   PLogFlops(2*a->nz*bs2 - a->m);
760   return 0;
761 }
762 
763 static int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
764 {
765   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
766   Scalar          *xg,*yg,*zg;
767   register Scalar *x,*y,*z,*v,sum;
768   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
769 
770   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
771   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
772   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
773 
774   idx   = a->j;
775   v     = a->a;
776   ii    = a->i;
777 
778   for ( i=0; i<mbs; i++ ) {
779     n    = ii[1] - ii[0]; ii++;
780     sum  = y[i];
781     while (n--) sum += *v++ * x[*idx++];
782     z[i] = sum;
783   }
784   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
785   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
786   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
787   PLogFlops(2*a->nz);
788   return 0;
789 }
790 
791 static int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
792 {
793   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
794   Scalar          *xg,*yg,*zg;
795   register Scalar *x,*y,*z,*v,*xb,sum1,sum2;
796   register Scalar x1,x2;
797   int             mbs=a->mbs,i,*idx,*ii;
798   int             j,n,ierr;
799 
800   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
801   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
802   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
803 
804   idx   = a->j;
805   v     = a->a;
806   ii    = a->i;
807 
808   for ( i=0; i<mbs; i++ ) {
809     n  = ii[1] - ii[0]; ii++;
810     sum1 = y[0]; sum2 = y[1];
811     for ( j=0; j<n; j++ ) {
812       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
813       sum1 += v[0]*x1 + v[2]*x2;
814       sum2 += v[1]*x1 + v[3]*x2;
815       v += 4;
816     }
817     z[0] = sum1; z[1] = sum2;
818     z += 2; y += 2;
819   }
820   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
821   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
822   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
823   PLogFlops(4*a->nz);
824   return 0;
825 }
826 
827 static int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
828 {
829   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
830   Scalar          *xg,*yg,*zg;
831   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
832   int             mbs=a->mbs,i,*idx,*ii,j,n,ierr;
833 
834   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
835   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
836   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
837 
838   idx   = a->j;
839   v     = a->a;
840   ii    = a->i;
841 
842   for ( i=0; i<mbs; i++ ) {
843     n  = ii[1] - ii[0]; ii++;
844     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
845     for ( j=0; j<n; j++ ) {
846       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
847       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
848       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
849       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
850       v += 9;
851     }
852     z[0] = sum1; z[1] = sum2; z[2] = sum3;
853     z += 3; y += 3;
854   }
855   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
856   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
857   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
858   PLogFlops(18*a->nz);
859   return 0;
860 }
861 
862 static int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
863 {
864   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
865   Scalar          *xg,*yg,*zg;
866   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4;
867   register Scalar x1,x2,x3,x4;
868   int             mbs=a->mbs,i,*idx,*ii;
869   int             j,n,ierr;
870 
871   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
872   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
873   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
874 
875   idx   = a->j;
876   v     = a->a;
877   ii    = a->i;
878 
879   for ( i=0; i<mbs; i++ ) {
880     n  = ii[1] - ii[0]; ii++;
881     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
882     for ( j=0; j<n; j++ ) {
883       xb = x + 4*(*idx++);
884       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
885       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
886       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
887       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
888       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
889       v += 16;
890     }
891     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
892     z += 4; y += 4;
893   }
894   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
895   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
896   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
897   PLogFlops(32*a->nz);
898   return 0;
899 }
900 
901 static int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
902 {
903   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
904   Scalar          *xg,*yg,*zg;
905   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
906   register Scalar x1,x2,x3,x4,x5;
907   int             mbs=a->mbs,i,*idx,*ii,j,n,ierr;
908 
909   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
910   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
911   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
912 
913   idx   = a->j;
914   v     = a->a;
915   ii    = a->i;
916 
917   for ( i=0; i<mbs; i++ ) {
918     n  = ii[1] - ii[0]; ii++;
919     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
920     for ( j=0; j<n; j++ ) {
921       xb = x + 5*(*idx++);
922       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
923       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
924       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
925       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
926       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
927       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
928       v += 25;
929     }
930     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
931     z += 5; y += 5;
932   }
933   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
934   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
935   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
936   PLogFlops(50*a->nz);
937   return 0;
938 }
939 
940 static int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
941 {
942   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
943   Scalar          *xg,*zg;
944   register Scalar *x,*z,*v,*xb;
945   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
946   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
947 
948   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
949 
950   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
951   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
952 
953   idx   = a->j;
954   v     = a->a;
955   ii    = a->i;
956 
957 
958   if (!a->mult_work) {
959     k = PetscMax(a->m,a->n);
960     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
961   }
962   work = a->mult_work;
963   for ( i=0; i<mbs; i++ ) {
964     n     = ii[1] - ii[0]; ii++;
965     ncols = n*bs;
966     workt = work;
967     for ( j=0; j<n; j++ ) {
968       xb = x + bs*(*idx++);
969       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
970       workt += bs;
971     }
972     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
973     v += n*bs2;
974     z += bs;
975   }
976   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
977   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
978   PLogFlops(2*a->nz*bs2 );
979   return 0;
980 }
981 
982 static int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
983 {
984   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
985   Scalar          *xg,*zg,*zb;
986   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
987   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
988   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
989 
990 
991   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
992   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
993   PetscMemzero(z,N*sizeof(Scalar));
994 
995   idx   = a->j;
996   v     = a->a;
997   ii    = a->i;
998 
999   switch (bs) {
1000   case 1:
1001     for ( i=0; i<mbs; i++ ) {
1002       n  = ii[1] - ii[0]; ii++;
1003       xb = x + i; x1 = xb[0];
1004       ib = idx + ai[i];
1005       for ( j=0; j<n; j++ ) {
1006         rval    = ib[j];
1007         z[rval] += *v++ * x1;
1008       }
1009     }
1010     break;
1011   case 2:
1012     for ( i=0; i<mbs; i++ ) {
1013       n  = ii[1] - ii[0]; ii++;
1014       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1015       ib = idx + ai[i];
1016       for ( j=0; j<n; j++ ) {
1017         rval      = ib[j]*2;
1018         z[rval++] += v[0]*x1 + v[1]*x2;
1019         z[rval++] += v[2]*x1 + v[3]*x2;
1020         v += 4;
1021       }
1022     }
1023     break;
1024   case 3:
1025     for ( i=0; i<mbs; i++ ) {
1026       n  = ii[1] - ii[0]; ii++;
1027       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1028       ib = idx + ai[i];
1029       for ( j=0; j<n; j++ ) {
1030         rval      = ib[j]*3;
1031         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1032         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1033         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1034         v += 9;
1035       }
1036     }
1037     break;
1038   case 4:
1039     for ( i=0; i<mbs; i++ ) {
1040       n  = ii[1] - ii[0]; ii++;
1041       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1042       ib = idx + ai[i];
1043       for ( j=0; j<n; j++ ) {
1044         rval      = ib[j]*4;
1045         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1046         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1047         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1048         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1049         v += 16;
1050       }
1051     }
1052     break;
1053   case 5:
1054     for ( i=0; i<mbs; i++ ) {
1055       n  = ii[1] - ii[0]; ii++;
1056       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1057       x4 = xb[3];   x5 = xb[4];
1058       ib = idx + ai[i];
1059       for ( j=0; j<n; j++ ) {
1060         rval      = ib[j]*5;
1061         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1062         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1063         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1064         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1065         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1066         v += 25;
1067       }
1068     }
1069     break;
1070       /* block sizes larger then 5 by 5 are handled by BLAS */
1071     default: {
1072       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1073       if (!a->mult_work) {
1074         k = PetscMax(a->m,a->n);
1075         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1076         CHKPTRQ(a->mult_work);
1077       }
1078       work = a->mult_work;
1079       for ( i=0; i<mbs; i++ ) {
1080         n     = ii[1] - ii[0]; ii++;
1081         ncols = n*bs;
1082         PetscMemzero(work,ncols*sizeof(Scalar));
1083         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1084         v += n*bs2;
1085         x += bs;
1086         workt = work;
1087         for ( j=0; j<n; j++ ) {
1088           zb = z + bs*(*idx++);
1089           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1090           workt += bs;
1091         }
1092       }
1093     }
1094   }
1095   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1096   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1097   PLogFlops(2*a->nz*a->bs2 - a->n);
1098   return 0;
1099 }
1100 static int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1101 {
1102   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1103   Scalar          *xg,*zg,*zb;
1104   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1105   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1106   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1107 
1108 
1109 
1110   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
1111   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
1112 
1113   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1114   else PetscMemzero(z,N*sizeof(Scalar));
1115 
1116 
1117   idx   = a->j;
1118   v     = a->a;
1119   ii    = a->i;
1120 
1121   switch (bs) {
1122   case 1:
1123     for ( i=0; i<mbs; i++ ) {
1124       n  = ii[1] - ii[0]; ii++;
1125       xb = x + i; x1 = xb[0];
1126       ib = idx + ai[i];
1127       for ( j=0; j<n; j++ ) {
1128         rval    = ib[j];
1129         z[rval] += *v++ * x1;
1130       }
1131     }
1132     break;
1133   case 2:
1134     for ( i=0; i<mbs; i++ ) {
1135       n  = ii[1] - ii[0]; ii++;
1136       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1137       ib = idx + ai[i];
1138       for ( j=0; j<n; j++ ) {
1139         rval      = ib[j]*2;
1140         z[rval++] += v[0]*x1 + v[1]*x2;
1141         z[rval++] += v[2]*x1 + v[3]*x2;
1142         v += 4;
1143       }
1144     }
1145     break;
1146   case 3:
1147     for ( i=0; i<mbs; i++ ) {
1148       n  = ii[1] - ii[0]; ii++;
1149       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1150       ib = idx + ai[i];
1151       for ( j=0; j<n; j++ ) {
1152         rval      = ib[j]*3;
1153         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1154         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1155         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1156         v += 9;
1157       }
1158     }
1159     break;
1160   case 4:
1161     for ( i=0; i<mbs; i++ ) {
1162       n  = ii[1] - ii[0]; ii++;
1163       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1164       ib = idx + ai[i];
1165       for ( j=0; j<n; j++ ) {
1166         rval      = ib[j]*4;
1167         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1168         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1169         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1170         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1171         v += 16;
1172       }
1173     }
1174     break;
1175   case 5:
1176     for ( i=0; i<mbs; i++ ) {
1177       n  = ii[1] - ii[0]; ii++;
1178       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1179       x4 = xb[3];   x5 = xb[4];
1180       ib = idx + ai[i];
1181       for ( j=0; j<n; j++ ) {
1182         rval      = ib[j]*5;
1183         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1184         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1185         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1186         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1187         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1188         v += 25;
1189       }
1190     }
1191     break;
1192       /* block sizes larger then 5 by 5 are handled by BLAS */
1193     default: {
1194       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1195       if (!a->mult_work) {
1196         k = PetscMax(a->m,a->n);
1197         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1198         CHKPTRQ(a->mult_work);
1199       }
1200       work = a->mult_work;
1201       for ( i=0; i<mbs; i++ ) {
1202         n     = ii[1] - ii[0]; ii++;
1203         ncols = n*bs;
1204         PetscMemzero(work,ncols*sizeof(Scalar));
1205         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1206         v += n*bs2;
1207         x += bs;
1208         workt = work;
1209         for ( j=0; j<n; j++ ) {
1210           zb = z + bs*(*idx++);
1211           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1212           workt += bs;
1213         }
1214       }
1215     }
1216   }
1217   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1218   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1219   PLogFlops(2*a->nz*a->bs2);
1220   return 0;
1221 }
1222 
1223 static int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,int *nz,int *nza,int *mem)
1224 {
1225   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1226   if (nz)  *nz  = a->bs2*a->nz;
1227   if (nza) *nza = a->maxnz;
1228   if (mem) *mem = (int)A->mem;
1229   return 0;
1230 }
1231 
1232 static int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1233 {
1234   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1235 
1236   if (B->type !=MATSEQBAIJ)SETERRQ(1,"MatEqual_SeqBAIJ:Matrices must be same type");
1237 
1238   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1239   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)||
1240       (a->nz != b->nz)) {
1241     *flg = PETSC_FALSE; return 0;
1242   }
1243 
1244   /* if the a->i are the same */
1245   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1246     *flg = PETSC_FALSE; return 0;
1247   }
1248 
1249   /* if a->j are the same */
1250   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1251     *flg = PETSC_FALSE; return 0;
1252   }
1253 
1254   /* if a->a are the same */
1255   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1256     *flg = PETSC_FALSE; return 0;
1257   }
1258   *flg = PETSC_TRUE;
1259   return 0;
1260 
1261 }
1262 
1263 static int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1264 {
1265   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1266   int         i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1267   Scalar      *x, zero = 0.0,*aa,*aa_j;
1268 
1269   bs   = a->bs;
1270   aa   = a->a;
1271   ai   = a->i;
1272   aj   = a->j;
1273   ambs = a->mbs;
1274   bs2  = a->bs2;
1275 
1276   VecSet(&zero,v);
1277   VecGetArray(v,&x); VecGetLocalSize(v,&n);
1278   if (n != a->m) SETERRQ(1,"MatGetDiagonal_SeqBAIJ:Nonconforming matrix and vector");
1279   for ( i=0; i<ambs; i++ ) {
1280     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1281       if (aj[j] == i) {
1282         row  = i*bs;
1283         aa_j = aa+j*bs2;
1284         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1285         break;
1286       }
1287     }
1288   }
1289   return 0;
1290 }
1291 
1292 static int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1293 {
1294   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1295   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1296   int         i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1297 
1298   ai  = a->i;
1299   aj  = a->j;
1300   aa  = a->a;
1301   m   = a->m;
1302   n   = a->n;
1303   bs  = a->bs;
1304   mbs = a->mbs;
1305   bs2 = a->bs2;
1306   if (ll) {
1307     VecGetArray(ll,&l); VecGetSize(ll,&lm);
1308     if (lm != m) SETERRQ(1,"MatDiagonalScale_SeqBAIJ:Left scaling vector wrong length");
1309     for ( i=0; i<mbs; i++ ) { /* for each block row */
1310       M  = ai[i+1] - ai[i];
1311       li = l + i*bs;
1312       v  = aa + bs2*ai[i];
1313       for ( j=0; j<M; j++ ) { /* for each block */
1314         for ( k=0; k<bs2; k++ ) {
1315           (*v++) *= li[k%bs];
1316         }
1317       }
1318     }
1319   }
1320 
1321   if (rr) {
1322     VecGetArray(rr,&r); VecGetSize(rr,&rn);
1323     if (rn != n) SETERRQ(1,"MatDiagonalScale_SeqBAIJ:Right scaling vector wrong length");
1324     for ( i=0; i<mbs; i++ ) { /* for each block row */
1325       M  = ai[i+1] - ai[i];
1326       v  = aa + bs2*ai[i];
1327       for ( j=0; j<M; j++ ) { /* for each block */
1328         ri = r + bs*aj[ai[i]+j];
1329         for ( k=0; k<bs; k++ ) {
1330           x = ri[k];
1331           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1332         }
1333       }
1334     }
1335   }
1336   return 0;
1337 }
1338 
1339 
1340 extern int MatLUFactorSymbolic_SeqBAIJ(Mat,IS,IS,double,Mat*);
1341 extern int MatLUFactor_SeqBAIJ(Mat,IS,IS,double);
1342 extern int MatIncreaseOverlap_SeqBAIJ(Mat,int,IS*,int);
1343 extern int MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatGetSubMatrixCall,Mat*);
1344 extern int MatGetSubMatrices_SeqBAIJ(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat**);
1345 
1346 extern int MatSolve_SeqBAIJ_N(Mat,Vec,Vec);
1347 extern int MatSolve_SeqBAIJ_1(Mat,Vec,Vec);
1348 extern int MatSolve_SeqBAIJ_2(Mat,Vec,Vec);
1349 extern int MatSolve_SeqBAIJ_3(Mat,Vec,Vec);
1350 extern int MatSolve_SeqBAIJ_4(Mat,Vec,Vec);
1351 extern int MatSolve_SeqBAIJ_5(Mat,Vec,Vec);
1352 
1353 extern int MatLUFactorNumeric_SeqBAIJ_N(Mat,Mat*);
1354 extern int MatLUFactorNumeric_SeqBAIJ_1(Mat,Mat*);
1355 extern int MatLUFactorNumeric_SeqBAIJ_2(Mat,Mat*);
1356 extern int MatLUFactorNumeric_SeqBAIJ_3(Mat,Mat*);
1357 extern int MatLUFactorNumeric_SeqBAIJ_4(Mat,Mat*);
1358 extern int MatLUFactorNumeric_SeqBAIJ_5(Mat,Mat*);
1359 
1360 static int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1361 {
1362   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1363   Scalar      *v = a->a;
1364   double      sum = 0.0;
1365   int         i,nz=a->nz,bs2=a->bs2;
1366 
1367   if (type == NORM_FROBENIUS) {
1368     for (i=0; i< bs2*nz; i++ ) {
1369 #if defined(PETSC_COMPLEX)
1370       sum += real(conj(*v)*(*v)); v++;
1371 #else
1372       sum += (*v)*(*v); v++;
1373 #endif
1374     }
1375     *norm = sqrt(sum);
1376   }
1377   else {
1378     SETERRQ(1,"MatNorm_SeqBAIJ:No support for this norm yet");
1379   }
1380   return 0;
1381 }
1382 
1383 /*
1384      note: This can only work for identity for row and col. It would
1385    be good to check this and otherwise generate an error.
1386 */
1387 static int MatILUFactor_SeqBAIJ(Mat inA,IS row,IS col,double efill,int fill)
1388 {
1389   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1390   Mat         outA;
1391   int         ierr;
1392 
1393   if (fill != 0) SETERRQ(1,"MatILUFactor_SeqBAIJ:Only fill=0 supported");
1394 
1395   outA          = inA;
1396   inA->factor   = FACTOR_LU;
1397   a->row        = row;
1398   a->col        = col;
1399 
1400   a->solve_work = (Scalar *) PetscMalloc((a->m+a->bs)*sizeof(Scalar));CHKPTRQ(a->solve_work);
1401 
1402   if (!a->diag) {
1403     ierr = MatMarkDiag_SeqBAIJ(inA); CHKERRQ(ierr);
1404   }
1405   ierr = MatLUFactorNumeric(inA,&outA); CHKERRQ(ierr);
1406   return 0;
1407 }
1408 
1409 static int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1410 {
1411   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1412   int         one = 1, totalnz = a->bs2*a->nz;
1413   BLscal_( &totalnz, alpha, a->a, &one );
1414   PLogFlops(totalnz);
1415   return 0;
1416 }
1417 
1418 static int MatGetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v)
1419 {
1420   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1421   int        *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
1422   int        *ai = a->i, *ailen = a->ilen;
1423   int        brow,bcol,ridx,cidx,bs=a->bs,bs2=a->bs2;
1424   Scalar     *ap, *aa = a->a, zero = 0.0;
1425 
1426   for ( k=0; k<m; k++ ) { /* loop over rows */
1427     row  = im[k]; brow = row/bs;
1428     if (row < 0) SETERRQ(1,"MatGetValues_SeqBAIJ:Negative row");
1429     if (row >= a->m) SETERRQ(1,"MatGetValues_SeqBAIJ:Row too large");
1430     rp   = aj + ai[brow] ; ap = aa + bs2*ai[brow] ;
1431     nrow = ailen[brow];
1432     for ( l=0; l<n; l++ ) { /* loop over columns */
1433       if (in[l] < 0) SETERRQ(1,"MatGetValues_SeqBAIJ:Negative column");
1434       if (in[l] >= a->n) SETERRQ(1,"MatGetValues_SeqBAIJ:Column too large");
1435       col  = in[l] ;
1436       bcol = col/bs;
1437       cidx = col%bs;
1438       ridx = row%bs;
1439       high = nrow;
1440       low  = 0; /* assume unsorted */
1441       while (high-low > 5) {
1442         t = (low+high)/2;
1443         if (rp[t] > bcol) high = t;
1444         else             low  = t;
1445       }
1446       for ( i=low; i<high; i++ ) {
1447         if (rp[i] > bcol) break;
1448         if (rp[i] == bcol) {
1449           *v++ = ap[bs2*i+bs*cidx+ridx];
1450           goto finished;
1451         }
1452       }
1453       *v++ = zero;
1454       finished:;
1455     }
1456   }
1457   return 0;
1458 }
1459 
1460 static int MatGetBlockSize_SeqBAIJ(Mat mat, int *bs)
1461 {
1462   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *) mat->data;
1463   *bs = baij->bs;
1464   return 0;
1465 }
1466 
1467 /* -------------------------------------------------------------------*/
1468 static struct _MatOps MatOps = {MatSetValues_SeqBAIJ,
1469        MatGetRow_SeqBAIJ,MatRestoreRow_SeqBAIJ,
1470        MatMult_SeqBAIJ_N,MatMultAdd_SeqBAIJ_N,
1471        MatMultTrans_SeqBAIJ,MatMultTransAdd_SeqBAIJ,
1472        MatSolve_SeqBAIJ_N,0,
1473        0,0,
1474        MatLUFactor_SeqBAIJ,0,
1475        0,
1476        MatTranspose_SeqBAIJ,
1477        MatGetInfo_SeqBAIJ,MatEqual_SeqBAIJ,
1478        MatGetDiagonal_SeqBAIJ,MatDiagonalScale_SeqBAIJ,MatNorm_SeqBAIJ,
1479        0,MatAssemblyEnd_SeqBAIJ,
1480        0,
1481        MatSetOption_SeqBAIJ,MatZeroEntries_SeqBAIJ,0,
1482        MatGetReordering_SeqBAIJ,
1483        MatLUFactorSymbolic_SeqBAIJ,MatLUFactorNumeric_SeqBAIJ_N,0,0,
1484        MatGetSize_SeqBAIJ,MatGetSize_SeqBAIJ,MatGetOwnershipRange_SeqBAIJ,
1485        MatILUFactorSymbolic_SeqBAIJ,0,
1486        0,0,/*  MatConvert_SeqBAIJ  */ 0,
1487        MatGetSubMatrix_SeqBAIJ,0,
1488        MatConvertSameType_SeqBAIJ,0,0,
1489        MatILUFactor_SeqBAIJ,0,0,
1490        MatGetSubMatrices_SeqBAIJ,MatIncreaseOverlap_SeqBAIJ,
1491        MatGetValues_SeqBAIJ,0,
1492        0,MatScale_SeqBAIJ,
1493        0,0,0,MatGetBlockSize_SeqBAIJ};
1494 
1495 /*@C
1496    MatCreateSeqBAIJ - Creates a sparse matrix in block AIJ (block
1497    compressed row) format.  For good matrix assembly performance the
1498    user should preallocate the matrix storage by setting the parameter nz
1499    (or the array nzz).  By setting these parameters accurately, performance
1500    during matrix assembly can be increased by more than a factor of 50.
1501 
1502    Input Parameters:
1503 .  comm - MPI communicator, set to MPI_COMM_SELF
1504 .  bs - size of block
1505 .  m - number of rows
1506 .  n - number of columns
1507 .  nz - number of block nonzeros per block row (same for all rows)
1508 .  nzz - array containing the number of block nonzeros in the various block rows
1509          (possibly different for each block row) or PETSC_NULL
1510 
1511    Output Parameter:
1512 .  A - the matrix
1513 
1514    Notes:
1515    The block AIJ format is fully compatible with standard Fortran 77
1516    storage.  That is, the stored row and column indices can begin at
1517    either one (as in Fortran) or zero.  See the users' manual for details.
1518 
1519    Specify the preallocated storage with either nz or nnz (not both).
1520    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
1521    allocation.  For additional details, see the users manual chapter on
1522    matrices and the file $(PETSC_DIR)/Performance.
1523 
1524 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1525 @*/
1526 int MatCreateSeqBAIJ(MPI_Comm comm,int bs,int m,int n,int nz,int *nnz, Mat *A)
1527 {
1528   Mat         B;
1529   Mat_SeqBAIJ *b;
1530   int         i,len,ierr,flg,mbs=m/bs,nbs=n/bs,bs2=bs*bs;
1531 
1532   if (mbs*bs!=m || nbs*bs!=n)
1533     SETERRQ(1,"MatCreateSeqBAIJ:Number rows, cols must be divisible by blocksize");
1534 
1535   *A = 0;
1536   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATSEQBAIJ,comm);
1537   PLogObjectCreate(B);
1538   B->data = (void *) (b = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(b);
1539   PetscMemzero(b,sizeof(Mat_SeqBAIJ));
1540   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1541   ierr = OptionsHasName(PETSC_NULL,"-mat_no_unroll",&flg); CHKERRQ(ierr);
1542   if (!flg) {
1543     switch (bs) {
1544       case 1:
1545         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
1546         B->ops.solve           = MatSolve_SeqBAIJ_1;
1547         B->ops.mult            = MatMult_SeqBAIJ_1;
1548         B->ops.multadd         = MatMultAdd_SeqBAIJ_1;
1549        break;
1550       case 2:
1551         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2;
1552         B->ops.solve           = MatSolve_SeqBAIJ_2;
1553         B->ops.mult            = MatMult_SeqBAIJ_2;
1554         B->ops.multadd         = MatMultAdd_SeqBAIJ_2;
1555         break;
1556       case 3:
1557         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3;
1558         B->ops.solve           = MatSolve_SeqBAIJ_3;
1559         B->ops.mult            = MatMult_SeqBAIJ_3;
1560         B->ops.multadd         = MatMultAdd_SeqBAIJ_3;
1561         break;
1562       case 4:
1563         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4;
1564         B->ops.solve           = MatSolve_SeqBAIJ_4;
1565         B->ops.mult            = MatMult_SeqBAIJ_4;
1566         B->ops.multadd         = MatMultAdd_SeqBAIJ_4;
1567         break;
1568       case 5:
1569         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5;
1570         B->ops.solve           = MatSolve_SeqBAIJ_5;
1571         B->ops.mult            = MatMult_SeqBAIJ_5;
1572         B->ops.multadd         = MatMultAdd_SeqBAIJ_5;
1573         break;
1574     }
1575   }
1576   B->destroy          = MatDestroy_SeqBAIJ;
1577   B->view             = MatView_SeqBAIJ;
1578   B->factor           = 0;
1579   B->lupivotthreshold = 1.0;
1580   b->row              = 0;
1581   b->col              = 0;
1582   b->reallocs         = 0;
1583 
1584   b->m       = m; B->m = m; B->M = m;
1585   b->n       = n; B->n = n; B->N = n;
1586   b->mbs     = mbs;
1587   b->nbs     = nbs;
1588   b->imax    = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(b->imax);
1589   if (nnz == PETSC_NULL) {
1590     if (nz == PETSC_DEFAULT) nz = 5;
1591     else if (nz <= 0)        nz = 1;
1592     for ( i=0; i<mbs; i++ ) b->imax[i] = nz;
1593     nz = nz*mbs;
1594   }
1595   else {
1596     nz = 0;
1597     for ( i=0; i<mbs; i++ ) {b->imax[i] = nnz[i]; nz += nnz[i];}
1598   }
1599 
1600   /* allocate the matrix space */
1601   len   = nz*sizeof(int) + nz*bs2*sizeof(Scalar) + (b->m+1)*sizeof(int);
1602   b->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
1603   PetscMemzero(b->a,nz*bs2*sizeof(Scalar));
1604   b->j  = (int *) (b->a + nz*bs2);
1605   PetscMemzero(b->j,nz*sizeof(int));
1606   b->i  = b->j + nz;
1607   b->singlemalloc = 1;
1608 
1609   b->i[0] = 0;
1610   for (i=1; i<mbs+1; i++) {
1611     b->i[i] = b->i[i-1] + b->imax[i-1];
1612   }
1613 
1614   /* b->ilen will count nonzeros in each block row so far. */
1615   b->ilen = (int *) PetscMalloc((mbs+1)*sizeof(int));
1616   PLogObjectMemory(B,len+2*(mbs+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_SeqBAIJ));
1617   for ( i=0; i<mbs; i++ ) { b->ilen[i] = 0;}
1618 
1619   b->bs               = bs;
1620   b->bs2              = bs2;
1621   b->mbs              = mbs;
1622   b->nz               = 0;
1623   b->maxnz            = nz*bs2;
1624   b->sorted           = 0;
1625   b->roworiented      = 1;
1626   b->nonew            = 0;
1627   b->diag             = 0;
1628   b->solve_work       = 0;
1629   b->mult_work        = 0;
1630   b->spptr            = 0;
1631   *A = B;
1632   ierr = OptionsHasName(PETSC_NULL,"-help", &flg); CHKERRQ(ierr);
1633   if (flg) {ierr = MatPrintHelp(B); CHKERRQ(ierr); }
1634   return 0;
1635 }
1636 
1637 int MatConvertSameType_SeqBAIJ(Mat A,Mat *B,int cpvalues)
1638 {
1639   Mat         C;
1640   Mat_SeqBAIJ *c,*a = (Mat_SeqBAIJ *) A->data;
1641   int         i,len, mbs = a->mbs,nz = a->nz,bs2 =a->bs2;
1642 
1643   if (a->i[mbs] != nz) SETERRQ(1,"MatConvertSameType_SeqBAIJ:Corrupt matrix");
1644 
1645   *B = 0;
1646   PetscHeaderCreate(C,_Mat,MAT_COOKIE,MATSEQBAIJ,A->comm);
1647   PLogObjectCreate(C);
1648   C->data       = (void *) (c = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(c);
1649   PetscMemcpy(&C->ops,&A->ops,sizeof(struct _MatOps));
1650   C->destroy    = MatDestroy_SeqBAIJ;
1651   C->view       = MatView_SeqBAIJ;
1652   C->factor     = A->factor;
1653   c->row        = 0;
1654   c->col        = 0;
1655   C->assembled  = PETSC_TRUE;
1656 
1657   c->m = C->m   = a->m;
1658   c->n = C->n   = a->n;
1659   C->M          = a->m;
1660   C->N          = a->n;
1661 
1662   c->bs         = a->bs;
1663   c->bs2        = a->bs2;
1664   c->mbs        = a->mbs;
1665   c->nbs        = a->nbs;
1666 
1667   c->imax       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->imax);
1668   c->ilen       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->ilen);
1669   for ( i=0; i<mbs; i++ ) {
1670     c->imax[i] = a->imax[i];
1671     c->ilen[i] = a->ilen[i];
1672   }
1673 
1674   /* allocate the matrix space */
1675   c->singlemalloc = 1;
1676   len   = (mbs+1)*sizeof(int) + nz*(bs2*sizeof(Scalar) + sizeof(int));
1677   c->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(c->a);
1678   c->j  = (int *) (c->a + nz*bs2);
1679   c->i  = c->j + nz;
1680   PetscMemcpy(c->i,a->i,(mbs+1)*sizeof(int));
1681   if (mbs > 0) {
1682     PetscMemcpy(c->j,a->j,nz*sizeof(int));
1683     if (cpvalues == COPY_VALUES) {
1684       PetscMemcpy(c->a,a->a,bs2*nz*sizeof(Scalar));
1685     }
1686   }
1687 
1688   PLogObjectMemory(C,len+2*(mbs+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_SeqBAIJ));
1689   c->sorted      = a->sorted;
1690   c->roworiented = a->roworiented;
1691   c->nonew       = a->nonew;
1692 
1693   if (a->diag) {
1694     c->diag = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(c->diag);
1695     PLogObjectMemory(C,(mbs+1)*sizeof(int));
1696     for ( i=0; i<mbs; i++ ) {
1697       c->diag[i] = a->diag[i];
1698     }
1699   }
1700   else c->diag          = 0;
1701   c->nz                 = a->nz;
1702   c->maxnz              = a->maxnz;
1703   c->solve_work         = 0;
1704   c->spptr              = 0;      /* Dangerous -I'm throwing away a->spptr */
1705   c->mult_work          = 0;
1706   *B = C;
1707   return 0;
1708 }
1709 
1710 int MatLoad_SeqBAIJ(Viewer viewer,MatType type,Mat *A)
1711 {
1712   Mat_SeqBAIJ  *a;
1713   Mat          B;
1714   int          i,nz,ierr,fd,header[4],size,*rowlengths=0,M,N,bs=1,flg;
1715   int          *mask,mbs,*jj,j,rowcount,nzcount,k,*browlengths,maskcount;
1716   int          kmax,jcount,block,idx,point,nzcountb,extra_rows;
1717   int          *masked, nmask,tmp,bs2,ishift;
1718   Scalar       *aa;
1719   MPI_Comm     comm = ((PetscObject) viewer)->comm;
1720 
1721   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
1722   bs2  = bs*bs;
1723 
1724   MPI_Comm_size(comm,&size);
1725   if (size > 1) SETERRQ(1,"MatLoad_SeqBAIJ:view must have one processor");
1726   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1727   ierr = PetscBinaryRead(fd,header,4,BINARY_INT); CHKERRQ(ierr);
1728   if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_SeqBAIJ:not Mat object");
1729   M = header[1]; N = header[2]; nz = header[3];
1730 
1731   if (M != N) SETERRQ(1,"MatLoad_SeqBAIJ:Can only do square matrices");
1732 
1733   /*
1734      This code adds extra rows to make sure the number of rows is
1735     divisible by the blocksize
1736   */
1737   mbs        = M/bs;
1738   extra_rows = bs - M + bs*(mbs);
1739   if (extra_rows == bs) extra_rows = 0;
1740   else                  mbs++;
1741   if (extra_rows) {
1742     PLogInfo(0,"MatLoad_SeqBAIJ:Padding loaded matrix to match blocksize");
1743   }
1744 
1745   /* read in row lengths */
1746   rowlengths = (int*) PetscMalloc((M+extra_rows)*sizeof(int));CHKPTRQ(rowlengths);
1747   ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
1748   for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
1749 
1750   /* read in column indices */
1751   jj = (int*) PetscMalloc( (nz+extra_rows)*sizeof(int) ); CHKPTRQ(jj);
1752   ierr = PetscBinaryRead(fd,jj,nz,BINARY_INT); CHKERRQ(ierr);
1753   for ( i=0; i<extra_rows; i++ ) jj[nz+i] = M+i;
1754 
1755   /* loop over row lengths determining block row lengths */
1756   browlengths = (int *) PetscMalloc(mbs*sizeof(int));CHKPTRQ(browlengths);
1757   PetscMemzero(browlengths,mbs*sizeof(int));
1758   mask   = (int *) PetscMalloc( 2*mbs*sizeof(int) ); CHKPTRQ(mask);
1759   PetscMemzero(mask,mbs*sizeof(int));
1760   masked = mask + mbs;
1761   rowcount = 0; nzcount = 0;
1762   for ( i=0; i<mbs; i++ ) {
1763     nmask = 0;
1764     for ( j=0; j<bs; j++ ) {
1765       kmax = rowlengths[rowcount];
1766       for ( k=0; k<kmax; k++ ) {
1767         tmp = jj[nzcount++]/bs;
1768         if (!mask[tmp]) {masked[nmask++] = tmp; mask[tmp] = 1;}
1769       }
1770       rowcount++;
1771     }
1772     browlengths[i] += nmask;
1773     /* zero out the mask elements we set */
1774     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
1775   }
1776 
1777   /* create our matrix */
1778   ierr = MatCreateSeqBAIJ(comm,bs,M+extra_rows,N+extra_rows,0,browlengths,A);
1779          CHKERRQ(ierr);
1780   B = *A;
1781   a = (Mat_SeqBAIJ *) B->data;
1782 
1783   /* set matrix "i" values */
1784   a->i[0] = 0;
1785   for ( i=1; i<= mbs; i++ ) {
1786     a->i[i]      = a->i[i-1] + browlengths[i-1];
1787     a->ilen[i-1] = browlengths[i-1];
1788   }
1789   a->nz         = 0;
1790   for ( i=0; i<mbs; i++ ) a->nz += browlengths[i];
1791 
1792   /* read in nonzero values */
1793   aa = (Scalar *) PetscMalloc((nz+extra_rows)*sizeof(Scalar));CHKPTRQ(aa);
1794   ierr = PetscBinaryRead(fd,aa,nz,BINARY_SCALAR); CHKERRQ(ierr);
1795   for ( i=0; i<extra_rows; i++ ) aa[nz+i] = 1.0;
1796 
1797   /* set "a" and "j" values into matrix */
1798   nzcount = 0; jcount = 0;
1799   for ( i=0; i<mbs; i++ ) {
1800     nzcountb = nzcount;
1801     nmask    = 0;
1802     for ( j=0; j<bs; j++ ) {
1803       kmax = rowlengths[i*bs+j];
1804       for ( k=0; k<kmax; k++ ) {
1805         tmp = jj[nzcount++]/bs;
1806 	if (!mask[tmp]) { masked[nmask++] = tmp; mask[tmp] = 1;}
1807       }
1808       rowcount++;
1809     }
1810     /* sort the masked values */
1811     PetscSortInt(nmask,masked);
1812 
1813     /* set "j" values into matrix */
1814     maskcount = 1;
1815     for ( j=0; j<nmask; j++ ) {
1816       a->j[jcount++]  = masked[j];
1817       mask[masked[j]] = maskcount++;
1818     }
1819     /* set "a" values into matrix */
1820     ishift = bs2*a->i[i];
1821     for ( j=0; j<bs; j++ ) {
1822       kmax = rowlengths[i*bs+j];
1823       for ( k=0; k<kmax; k++ ) {
1824         tmp    = jj[nzcountb]/bs ;
1825         block  = mask[tmp] - 1;
1826         point  = jj[nzcountb] - bs*tmp;
1827         idx    = ishift + bs2*block + j + bs*point;
1828         a->a[idx] = aa[nzcountb++];
1829       }
1830     }
1831     /* zero out the mask elements we set */
1832     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
1833   }
1834   if (jcount != a->nz) SETERRQ(1,"MatLoad_SeqBAIJ:Error bad binary matrix");
1835 
1836   PetscFree(rowlengths);
1837   PetscFree(browlengths);
1838   PetscFree(aa);
1839   PetscFree(jj);
1840   PetscFree(mask);
1841 
1842   B->assembled = PETSC_TRUE;
1843 
1844   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info",&flg); CHKERRQ(ierr);
1845   if (flg) {
1846     Viewer tviewer;
1847     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
1848     ierr = ViewerSetFormat(tviewer,ASCII_FORMAT_INFO,0);CHKERRQ(ierr);
1849     ierr = MatView(B,tviewer); CHKERRQ(ierr);
1850     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1851   }
1852   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info_detailed",&flg);CHKERRQ(ierr);
1853   if (flg) {
1854     Viewer tviewer;
1855     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
1856     ierr = ViewerSetFormat(tviewer,ASCII_FORMAT_INFO_DETAILED,0);CHKERRQ(ierr);
1857     ierr = MatView(B,tviewer); CHKERRQ(ierr);
1858     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1859   }
1860   ierr = OptionsHasName(PETSC_NULL,"-mat_view",&flg); CHKERRQ(ierr);
1861   if (flg) {
1862     Viewer tviewer;
1863     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
1864     ierr = MatView(B,tviewer); CHKERRQ(ierr);
1865     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1866   }
1867   ierr = OptionsHasName(PETSC_NULL,"-mat_view_matlab",&flg); CHKERRQ(ierr);
1868   if (flg) {
1869     Viewer tviewer;
1870     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
1871     ierr = ViewerSetFormat(tviewer,ASCII_FORMAT_MATLAB,"M");CHKERRQ(ierr);
1872     ierr = MatView(B,tviewer); CHKERRQ(ierr);
1873     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1874   }
1875   ierr = OptionsHasName(PETSC_NULL,"-mat_view_draw",&flg); CHKERRQ(ierr);
1876   if (flg) {
1877     Viewer tviewer;
1878     ierr = ViewerDrawOpenX(B->comm,0,0,0,0,300,300,&tviewer); CHKERRQ(ierr);
1879     ierr = MatView(B,(Viewer)tviewer); CHKERRQ(ierr);
1880     ierr = ViewerFlush(tviewer); CHKERRQ(ierr);
1881     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1882   }
1883   return 0;
1884 }
1885 
1886 
1887 
1888