xref: /petsc/src/mat/impls/baij/seq/baij.c (revision 2b3076dc7d5306ac29c7a2bb400e8e4ff1ac4c63)
1 
2 #ifndef lint
3 static char vcid[] = "$Id: baij.c,v 1.57 1996/07/09 17:06:28 balay Exp balay $";
4 #endif
5 
6 /*
7     Defines the basic matrix operations for the BAIJ (compressed row)
8   matrix storage format.
9 */
10 #include "baij.h"
11 #include "src/vec/vecimpl.h"
12 #include "src/inline/spops.h"
13 #include "petsc.h"
14 
15 extern   int MatToSymmetricIJ_SeqAIJ(int,int*,int*,int,int,int**,int**);
16 
17 static int MatGetReordering_SeqBAIJ(Mat A,MatReordering type,IS *rperm,IS *cperm)
18 {
19   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
20   int         ierr, *ia, *ja,n = a->mbs,*idx,i,ishift,oshift;
21 
22   /*
23      this is tacky: In the future when we have written special factorization
24      and solve routines for the identity permutation we should use a
25      stride index set instead of the general one.
26   */
27   if (type  == ORDER_NATURAL) {
28     idx = (int *) PetscMalloc( n*sizeof(int) ); CHKPTRQ(idx);
29     for ( i=0; i<n; i++ ) idx[i] = i;
30     ierr = ISCreateSeq(MPI_COMM_SELF,n,idx,rperm); CHKERRQ(ierr);
31     ierr = ISCreateSeq(MPI_COMM_SELF,n,idx,cperm); CHKERRQ(ierr);
32     PetscFree(idx);
33     ISSetPermutation(*rperm);
34     ISSetPermutation(*cperm);
35     ISSetIdentity(*rperm);
36     ISSetIdentity(*cperm);
37     return 0;
38   }
39 
40   MatReorderingRegisterAll();
41   ishift = 0;
42   oshift = -MatReorderingIndexShift[(int)type];
43   if (MatReorderingRequiresSymmetric[(int)type]) {
44     ierr = MatToSymmetricIJ_SeqAIJ(n,a->i,a->j,ishift,oshift,&ia,&ja);CHKERRQ(ierr);
45     ierr = MatGetReordering_IJ(n,ia,ja,type,rperm,cperm); CHKERRQ(ierr);
46     PetscFree(ia); PetscFree(ja);
47   } else {
48     if (ishift == oshift) {
49       ierr = MatGetReordering_IJ(n,a->i,a->j,type,rperm,cperm);CHKERRQ(ierr);
50     }
51     else if (ishift == -1) {
52       /* temporarily subtract 1 from i and j indices */
53       int nz = a->i[n] - 1;
54       for ( i=0; i<nz; i++ ) a->j[i]--;
55       for ( i=0; i<n+1; i++ ) a->i[i]--;
56       ierr = MatGetReordering_IJ(n,a->i,a->j,type,rperm,cperm);CHKERRQ(ierr);
57       for ( i=0; i<nz; i++ ) a->j[i]++;
58       for ( i=0; i<n+1; i++ ) a->i[i]++;
59     } else {
60       /* temporarily add 1 to i and j indices */
61       int nz = a->i[n] - 1;
62       for ( i=0; i<nz; i++ ) a->j[i]++;
63       for ( i=0; i<n+1; i++ ) a->i[i]++;
64       ierr = MatGetReordering_IJ(n,a->i,a->j,type,rperm,cperm);CHKERRQ(ierr);
65       for ( i=0; i<nz; i++ ) a->j[i]--;
66       for ( i=0; i<n+1; i++ ) a->i[i]--;
67     }
68   }
69   return 0;
70 }
71 
72 /*
73      Adds diagonal pointers to sparse matrix structure.
74 */
75 
76 int MatMarkDiag_SeqBAIJ(Mat A)
77 {
78   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
79   int         i,j, *diag, m = a->mbs;
80 
81   diag = (int *) PetscMalloc( (m+1)*sizeof(int)); CHKPTRQ(diag);
82   PLogObjectMemory(A,(m+1)*sizeof(int));
83   for ( i=0; i<m; i++ ) {
84     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
85       if (a->j[j] == i) {
86         diag[i] = j;
87         break;
88       }
89     }
90   }
91   a->diag = diag;
92   return 0;
93 }
94 
95 #include "draw.h"
96 #include "pinclude/pviewer.h"
97 #include "sys.h"
98 
99 static int MatView_SeqBAIJ_Binary(Mat A,Viewer viewer)
100 {
101   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
102   int         i, fd, *col_lens, ierr, bs = a->bs,count,*jj,j,k,l,bs2=a->bs2;
103   Scalar      *aa;
104 
105   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
106   col_lens = (int *) PetscMalloc((4+a->m)*sizeof(int));CHKPTRQ(col_lens);
107   col_lens[0] = MAT_COOKIE;
108   col_lens[1] = a->m;
109   col_lens[2] = a->n;
110   col_lens[3] = a->nz*bs2;
111 
112   /* store lengths of each row and write (including header) to file */
113   count = 0;
114   for ( i=0; i<a->mbs; i++ ) {
115     for ( j=0; j<bs; j++ ) {
116       col_lens[4+count++] = bs*(a->i[i+1] - a->i[i]);
117     }
118   }
119   ierr = PetscBinaryWrite(fd,col_lens,4+a->m,BINARY_INT,1); CHKERRQ(ierr);
120   PetscFree(col_lens);
121 
122   /* store column indices (zero start index) */
123   jj = (int *) PetscMalloc( a->nz*bs2*sizeof(int) ); CHKPTRQ(jj);
124   count = 0;
125   for ( i=0; i<a->mbs; i++ ) {
126     for ( j=0; j<bs; j++ ) {
127       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
128         for ( l=0; l<bs; l++ ) {
129           jj[count++] = bs*a->j[k] + l;
130         }
131       }
132     }
133   }
134   ierr = PetscBinaryWrite(fd,jj,bs2*a->nz,BINARY_INT,0); CHKERRQ(ierr);
135   PetscFree(jj);
136 
137   /* store nonzero values */
138   aa = (Scalar *) PetscMalloc(a->nz*bs2*sizeof(Scalar)); CHKPTRQ(aa);
139   count = 0;
140   for ( i=0; i<a->mbs; i++ ) {
141     for ( j=0; j<bs; j++ ) {
142       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
143         for ( l=0; l<bs; l++ ) {
144           aa[count++] = a->a[bs2*k + l*bs + j];
145         }
146       }
147     }
148   }
149   ierr = PetscBinaryWrite(fd,aa,bs2*a->nz,BINARY_SCALAR,0); CHKERRQ(ierr);
150   PetscFree(aa);
151   return 0;
152 }
153 
154 static int MatView_SeqBAIJ_ASCII(Mat A,Viewer viewer)
155 {
156   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
157   int         ierr, i,j,format,bs = a->bs,k,l,bs2=a->bs2;
158   FILE        *fd;
159   char        *outputname;
160 
161   ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
162   ierr = ViewerFileGetOutputname_Private(viewer,&outputname);CHKERRQ(ierr);
163   ierr = ViewerGetFormat(viewer,&format);
164   if (format == ASCII_FORMAT_INFO || format == ASCII_FORMAT_INFO_DETAILED) {
165     fprintf(fd,"  block size is %d\n",bs);
166   }
167   else if (format == ASCII_FORMAT_MATLAB) {
168     SETERRQ(1,"MatView_SeqBAIJ_ASCII:Matlab format not supported");
169   }
170   else if (format == ASCII_FORMAT_COMMON) {
171     for ( i=0; i<a->mbs; i++ ) {
172       for ( j=0; j<bs; j++ ) {
173         fprintf(fd,"row %d:",i*bs+j);
174         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
175           for ( l=0; l<bs; l++ ) {
176 #if defined(PETSC_COMPLEX)
177           if (imag(a->a[bs2*k + l*bs + j]) != 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
178             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
179               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
180           else if (real(a->a[bs2*k + l*bs + j]) != 0.0)
181             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
182 #else
183           if (a->a[bs2*k + l*bs + j] != 0.0)
184             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
185 #endif
186           }
187         }
188         fprintf(fd,"\n");
189       }
190     }
191   }
192   else {
193     for ( i=0; i<a->mbs; i++ ) {
194       for ( j=0; j<bs; j++ ) {
195         fprintf(fd,"row %d:",i*bs+j);
196         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
197           for ( l=0; l<bs; l++ ) {
198 #if defined(PETSC_COMPLEX)
199           if (imag(a->a[bs2*k + l*bs + j]) != 0.0) {
200             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
201               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
202           }
203           else {
204             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
205           }
206 #else
207             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
208 #endif
209           }
210         }
211         fprintf(fd,"\n");
212       }
213     }
214   }
215   fflush(fd);
216   return 0;
217 }
218 
219 static int MatView_SeqBAIJ(PetscObject obj,Viewer viewer)
220 {
221   Mat         A = (Mat) obj;
222   ViewerType  vtype;
223   int         ierr;
224 
225   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
226   if (vtype == MATLAB_VIEWER) {
227     SETERRQ(1,"MatView_SeqBAIJ:Matlab viewer not supported");
228   }
229   else if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER){
230     return MatView_SeqBAIJ_ASCII(A,viewer);
231   }
232   else if (vtype == BINARY_FILE_VIEWER) {
233     return MatView_SeqBAIJ_Binary(A,viewer);
234   }
235   else if (vtype == DRAW_VIEWER) {
236     SETERRQ(1,"MatView_SeqBAIJ:Draw viewer not supported");
237   }
238   return 0;
239 }
240 
241 #define CHUNKSIZE  10
242 
243 /* This version has row oriented v  */
244 static int MatSetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
245 {
246   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
247   int         *rp,k,low,high,t,ii,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
248   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen,roworiented=a->roworiented;
249   int         *aj=a->j,nonew=a->nonew,bs=a->bs,brow,bcol;
250   int          ridx,cidx,bs2=a->bs2;
251   Scalar      *ap,value,*aa=a->a,*bap;
252 
253   for ( k=0; k<m; k++ ) { /* loop over added rows */
254     row  = im[k]; brow = row/bs;
255     if (row < 0) SETERRQ(1,"MatSetValues_SeqBAIJ:Negative row");
256     if (row >= a->m) SETERRQ(1,"MatSetValues_SeqBAIJ:Row too large");
257     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
258     rmax = imax[brow]; nrow = ailen[brow];
259     low = 0;
260     for ( l=0; l<n; l++ ) { /* loop over added columns */
261       if (in[l] < 0) SETERRQ(1,"MatSetValues_SeqBAIJ:Negative column");
262       if (in[l] >= a->n) SETERRQ(1,"MatSetValues_SeqBAIJ:Column too large");
263       col = in[l]; bcol = col/bs;
264       ridx = row % bs; cidx = col % bs;
265       if (roworiented) {
266         value = *v++;
267       }
268       else {
269         value = v[k + l*m];
270       }
271       if (!sorted) low = 0; high = nrow;
272       while (high-low > 5) {
273         t = (low+high)/2;
274         if (rp[t] > bcol) high = t;
275         else             low  = t;
276       }
277       for ( i=low; i<high; i++ ) {
278         if (rp[i] > bcol) break;
279         if (rp[i] == bcol) {
280           bap  = ap +  bs2*i + bs*cidx + ridx;
281           if (is == ADD_VALUES) *bap += value;
282           else                  *bap  = value;
283           goto noinsert;
284         }
285       }
286       if (nonew) goto noinsert;
287       if (nrow >= rmax) {
288         /* there is no extra room in row, therefore enlarge */
289         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
290         Scalar *new_a;
291 
292         /* malloc new storage space */
293         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
294         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
295         new_j   = (int *) (new_a + bs2*new_nz);
296         new_i   = new_j + new_nz;
297 
298         /* copy over old data into new slots */
299         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];}
300         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
301         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));
302         len = (new_nz - CHUNKSIZE - ai[brow] - nrow);
303         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,
304                                                            len*sizeof(int));
305         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar));
306         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
307         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE),
308                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));
309         /* free up old matrix storage */
310         PetscFree(a->a);
311         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
312         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
313         a->singlemalloc = 1;
314 
315         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
316         rmax = imax[brow] = imax[brow] + CHUNKSIZE;
317         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
318         a->maxnz += bs2*CHUNKSIZE;
319         a->reallocs++;
320         a->nz++;
321       }
322       N = nrow++ - 1;
323       /* shift up all the later entries in this row */
324       for ( ii=N; ii>=i; ii-- ) {
325         rp[ii+1] = rp[ii];
326          PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
327       }
328       if (N>=i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
329       rp[i] = bcol;
330       ap[bs2*i + bs*cidx + ridx] = value;
331       noinsert:;
332       low = i;
333     }
334     ailen[brow] = nrow;
335   }
336   return 0;
337 }
338 
339 static int MatGetSize_SeqBAIJ(Mat A,int *m,int *n)
340 {
341   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
342   *m = a->m; *n = a->n;
343   return 0;
344 }
345 
346 static int MatGetOwnershipRange_SeqBAIJ(Mat A,int *m,int *n)
347 {
348   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
349   *m = 0; *n = a->m;
350   return 0;
351 }
352 
353 int MatGetRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
354 {
355   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
356   int          itmp,i,j,k,M,*ai,*aj,bs,bn,bp,*idx_i,bs2;
357   Scalar      *aa,*v_i,*aa_i;
358 
359   bs  = a->bs;
360   ai  = a->i;
361   aj  = a->j;
362   aa  = a->a;
363   bs2 = a->bs2;
364 
365   if (row < 0 || row >= a->m) SETERRQ(1,"MatGetRow_SeqBAIJ:Row out of range");
366 
367   bn  = row/bs;   /* Block number */
368   bp  = row % bs; /* Block Position */
369   M   = ai[bn+1] - ai[bn];
370   *nz = bs*M;
371 
372   if (v) {
373     *v = 0;
374     if (*nz) {
375       *v = (Scalar *) PetscMalloc( (*nz)*sizeof(Scalar) ); CHKPTRQ(*v);
376       for ( i=0; i<M; i++ ) { /* for each block in the block row */
377         v_i  = *v + i*bs;
378         aa_i = aa + bs2*(ai[bn] + i);
379         for ( j=bp,k=0; j<bs2; j+=bs,k++ ) {v_i[k] = aa_i[j];}
380       }
381     }
382   }
383 
384   if (idx) {
385     *idx = 0;
386     if (*nz) {
387       *idx = (int *) PetscMalloc( (*nz)*sizeof(int) ); CHKPTRQ(*idx);
388       for ( i=0; i<M; i++ ) { /* for each block in the block row */
389         idx_i = *idx + i*bs;
390         itmp  = bs*aj[ai[bn] + i];
391         for ( j=0; j<bs; j++ ) {idx_i[j] = itmp++;}
392       }
393     }
394   }
395   return 0;
396 }
397 
398 int MatRestoreRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
399 {
400   if (idx) {if (*idx) PetscFree(*idx);}
401   if (v)   {if (*v)   PetscFree(*v);}
402   return 0;
403 }
404 
405 static int MatTranspose_SeqBAIJ(Mat A,Mat *B)
406 {
407   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
408   Mat         C;
409   int         i,j,k,ierr,*aj=a->j,*ai=a->i,bs=a->bs,mbs=a->mbs,nbs=a->nbs,len,*col;
410   int         *rows,*cols,bs2=a->bs2;
411   Scalar      *array=a->a;
412 
413   if (B==PETSC_NULL && mbs!=nbs)
414     SETERRQ(1,"MatTranspose_SeqBAIJ:Square matrix only for in-place");
415   col = (int *) PetscMalloc((1+nbs)*sizeof(int)); CHKPTRQ(col);
416   PetscMemzero(col,(1+nbs)*sizeof(int));
417 
418   for ( i=0; i<ai[mbs]; i++ ) col[aj[i]] += 1;
419   ierr = MatCreateSeqBAIJ(A->comm,bs,a->n,a->m,PETSC_NULL,col,&C); CHKERRQ(ierr);
420   PetscFree(col);
421   rows = (int *) PetscMalloc(2*bs*sizeof(int)); CHKPTRQ(rows);
422   cols = rows + bs;
423   for ( i=0; i<mbs; i++ ) {
424     cols[0] = i*bs;
425     for (k=1; k<bs; k++ ) cols[k] = cols[k-1] + 1;
426     len = ai[i+1] - ai[i];
427     for ( j=0; j<len; j++ ) {
428       rows[0] = (*aj++)*bs;
429       for (k=1; k<bs; k++ ) rows[k] = rows[k-1] + 1;
430       ierr = MatSetValues(C,bs,rows,bs,cols,array,INSERT_VALUES); CHKERRQ(ierr);
431       array += bs2;
432     }
433   }
434   PetscFree(rows);
435 
436   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
437   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
438 
439   if (B != PETSC_NULL) {
440     *B = C;
441   } else {
442     /* This isn't really an in-place transpose */
443     PetscFree(a->a);
444     if (!a->singlemalloc) {PetscFree(a->i); PetscFree(a->j);}
445     if (a->diag) PetscFree(a->diag);
446     if (a->ilen) PetscFree(a->ilen);
447     if (a->imax) PetscFree(a->imax);
448     if (a->solve_work) PetscFree(a->solve_work);
449     PetscFree(a);
450     PetscMemcpy(A,C,sizeof(struct _Mat));
451     PetscHeaderDestroy(C);
452   }
453   return 0;
454 }
455 
456 
457 static int MatAssemblyEnd_SeqBAIJ(Mat A,MatAssemblyType mode)
458 {
459   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
460   int        fshift = 0,i,j,*ai = a->i, *aj = a->j, *imax = a->imax;
461   int        m = a->m,*ip, N, *ailen = a->ilen;
462   int        mbs = a->mbs, bs2 = a->bs2;
463   Scalar     *aa = a->a, *ap;
464 
465   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
466 
467   for ( i=1; i<mbs; i++ ) {
468     /* move each row back by the amount of empty slots (fshift) before it*/
469     fshift += imax[i-1] - ailen[i-1];
470     if (fshift) {
471       ip = aj + ai[i]; ap = aa + bs2*ai[i];
472       N = ailen[i];
473       for ( j=0; j<N; j++ ) {
474         ip[j-fshift] = ip[j];
475         PetscMemcpy(ap+(j-fshift)*bs2,ap+j*bs2,bs2*sizeof(Scalar));
476       }
477     }
478     ai[i] = ai[i-1] + ailen[i-1];
479   }
480   if (mbs) {
481     fshift += imax[mbs-1] - ailen[mbs-1];
482     ai[mbs] = ai[mbs-1] + ailen[mbs-1];
483   }
484   /* reset ilen and imax for each row */
485   for ( i=0; i<mbs; i++ ) {
486     ailen[i] = imax[i] = ai[i+1] - ai[i];
487   }
488   a->nz = ai[mbs];
489 
490   /* diagonals may have moved, so kill the diagonal pointers */
491   if (fshift && a->diag) {
492     PetscFree(a->diag);
493     PLogObjectMemory(A,-(m+1)*sizeof(int));
494     a->diag = 0;
495   }
496   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ: Unneed storage space (blocks) %d used %d, rows %d, block size %d\n", fshift*bs2,a->nz*bs2,m,a->bs);
497   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ: Number of mallocs during MatSetValues %d\n",
498            a->reallocs);
499   return 0;
500 }
501 
502 static int MatZeroEntries_SeqBAIJ(Mat A)
503 {
504   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
505   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
506   return 0;
507 }
508 
509 int MatDestroy_SeqBAIJ(PetscObject obj)
510 {
511   Mat         A  = (Mat) obj;
512   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
513 
514 #if defined(PETSC_LOG)
515   PLogObjectState(obj,"Rows=%d, Cols=%d, NZ=%d",a->m,a->n,a->nz);
516 #endif
517   PetscFree(a->a);
518   if (!a->singlemalloc) { PetscFree(a->i); PetscFree(a->j);}
519   if (a->diag) PetscFree(a->diag);
520   if (a->ilen) PetscFree(a->ilen);
521   if (a->imax) PetscFree(a->imax);
522   if (a->solve_work) PetscFree(a->solve_work);
523   if (a->mult_work) PetscFree(a->mult_work);
524   PetscFree(a);
525   PLogObjectDestroy(A);
526   PetscHeaderDestroy(A);
527   return 0;
528 }
529 
530 static int MatSetOption_SeqBAIJ(Mat A,MatOption op)
531 {
532   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
533   if      (op == MAT_ROW_ORIENTED)              a->roworiented = 1;
534   else if (op == MAT_COLUMN_ORIENTED)           a->roworiented = 0;
535   else if (op == MAT_COLUMNS_SORTED)            a->sorted      = 1;
536   else if (op == MAT_NO_NEW_NONZERO_LOCATIONS)  a->nonew       = 1;
537   else if (op == MAT_YES_NEW_NONZERO_LOCATIONS) a->nonew       = 0;
538   else if (op == MAT_ROWS_SORTED ||
539            op == MAT_SYMMETRIC ||
540            op == MAT_STRUCTURALLY_SYMMETRIC ||
541            op == MAT_YES_NEW_DIAGONALS)
542     PLogInfo(A,"Info:MatSetOption_SeqBAIJ:Option ignored\n");
543   else if (op == MAT_NO_NEW_DIAGONALS)
544     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_SeqBAIJ:MAT_NO_NEW_DIAGONALS");}
545   else
546     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_SeqBAIJ:unknown option");}
547   return 0;
548 }
549 
550 
551 /* -------------------------------------------------------*/
552 /* Should check that shapes of vectors and matrices match */
553 /* -------------------------------------------------------*/
554 #include "pinclude/plapack.h"
555 
556 static int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
557 {
558   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
559   Scalar          *xg,*zg;
560   register Scalar *x,*z,*v,sum;
561   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
562 
563   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
564   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
565 
566   idx   = a->j;
567   v     = a->a;
568   ii    = a->i;
569 
570   for ( i=0; i<mbs; i++ ) {
571     n    = ii[1] - ii[0]; ii++;
572     sum  = 0.0;
573     while (n--) sum += *v++ * x[*idx++];
574     z[i] = sum;
575   }
576   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
577   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
578   PLogFlops(2*a->nz - a->m);
579   return 0;
580 }
581 
582 static int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
583 {
584   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
585   Scalar          *xg,*zg;
586   register Scalar *x,*z,*v,*xb,sum1,sum2;
587   register Scalar x1,x2;
588   int             mbs=a->mbs,i,*idx,*ii;
589   int             j,n,ierr;
590 
591   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
592   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
593 
594   idx   = a->j;
595   v     = a->a;
596   ii    = a->i;
597 
598   for ( i=0; i<mbs; i++ ) {
599     n  = ii[1] - ii[0]; ii++;
600     sum1 = 0.0; sum2 = 0.0;
601     for ( j=0; j<n; j++ ) {
602       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
603       sum1 += v[0]*x1 + v[2]*x2;
604       sum2 += v[1]*x1 + v[3]*x2;
605       v += 4;
606     }
607     z[0] = sum1; z[1] = sum2;
608     z += 2;
609   }
610   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
611   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
612   PLogFlops(4*a->nz - a->m);
613   return 0;
614 }
615 
616 static int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
617 {
618   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
619   Scalar          *xg,*zg;
620   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
621   int             mbs=a->mbs,i,*idx,*ii,j,n,ierr;
622 
623   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
624   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
625 
626   idx   = a->j;
627   v     = a->a;
628   ii    = a->i;
629 
630   for ( i=0; i<mbs; i++ ) {
631     n  = ii[1] - ii[0]; ii++;
632     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
633     for ( j=0; j<n; j++ ) {
634       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
635       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
636       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
637       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
638       v += 9;
639     }
640     z[0] = sum1; z[1] = sum2; z[2] = sum3;
641     z += 3;
642   }
643   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
644   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
645   PLogFlops(18*a->nz - a->m);
646   return 0;
647 }
648 
649 static int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
650 {
651   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
652   Scalar          *xg,*zg;
653   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4;
654   register Scalar x1,x2,x3,x4;
655   int             mbs=a->mbs,i,*idx,*ii;
656   int             j,n,ierr;
657 
658   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
659   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
660 
661   idx   = a->j;
662   v     = a->a;
663   ii    = a->i;
664 
665   for ( i=0; i<mbs; i++ ) {
666     n  = ii[1] - ii[0]; ii++;
667     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
668     for ( j=0; j<n; j++ ) {
669       xb = x + 4*(*idx++);
670       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
671       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
672       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
673       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
674       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
675       v += 16;
676     }
677     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
678     z += 4;
679   }
680   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
681   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
682   PLogFlops(32*a->nz - a->m);
683   return 0;
684 }
685 
686 static int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
687 {
688   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
689   Scalar          *xg,*zg;
690   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
691   register Scalar x1,x2,x3,x4,x5;
692   int             mbs=a->mbs,i,*idx,*ii,j,n,ierr;
693 
694   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
695   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
696 
697   idx   = a->j;
698   v     = a->a;
699   ii    = a->i;
700 
701   for ( i=0; i<mbs; i++ ) {
702     n  = ii[1] - ii[0]; ii++;
703     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
704     for ( j=0; j<n; j++ ) {
705       xb = x + 5*(*idx++);
706       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
707       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
708       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
709       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
710       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
711       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
712       v += 25;
713     }
714     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
715     z += 5;
716   }
717   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
718   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
719   PLogFlops(50*a->nz - a->m);
720   return 0;
721 }
722 
723 static int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
724 {
725   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
726   Scalar          *xg,*zg;
727   register Scalar *x,*z,*v,*xb;
728   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
729   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt, _DZero = 0.0;
730 
731   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
732   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
733 
734   idx   = a->j;
735   v     = a->a;
736   ii    = a->i;
737 
738 
739   if (!a->mult_work) {
740     k = PetscMax(a->m,a->n);
741     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
742   }
743   work = a->mult_work;
744   for ( i=0; i<mbs; i++ ) {
745     n     = ii[1] - ii[0]; ii++;
746     ncols = n*bs;
747     workt = work;
748     for ( j=0; j<n; j++ ) {
749       xb = x + bs*(*idx++);
750       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
751       workt += bs;
752     }
753     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
754     v += n*bs2;
755     z += bs;
756   }
757    ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
758   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
759   PLogFlops(2*a->nz*bs2 - a->m);
760   return 0;
761 }
762 
763 static int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
764 {
765   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
766   Scalar          *xg,*yg,*zg;
767   register Scalar *x,*y,*z,*v,sum;
768   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
769 
770   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
771   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
772   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
773 
774   idx   = a->j;
775   v     = a->a;
776   ii    = a->i;
777 
778   for ( i=0; i<mbs; i++ ) {
779     n    = ii[1] - ii[0]; ii++;
780     sum  = y[i];
781     while (n--) sum += *v++ * x[*idx++];
782     z[i] = sum;
783   }
784   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
785   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
786   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
787   PLogFlops(2*a->nz);
788   return 0;
789 }
790 
791 static int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
792 {
793   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
794   Scalar          *xg,*yg,*zg;
795   register Scalar *x,*y,*z,*v,*xb,sum1,sum2;
796   register Scalar x1,x2;
797   int             mbs=a->mbs,i,*idx,*ii;
798   int             j,n,ierr;
799 
800   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
801   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
802   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
803 
804   idx   = a->j;
805   v     = a->a;
806   ii    = a->i;
807 
808   for ( i=0; i<mbs; i++ ) {
809     n  = ii[1] - ii[0]; ii++;
810     sum1 = y[0]; sum2 = y[1];
811     for ( j=0; j<n; j++ ) {
812       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
813       sum1 += v[0]*x1 + v[2]*x2;
814       sum2 += v[1]*x1 + v[3]*x2;
815       v += 4;
816     }
817     z[0] = sum1; z[1] = sum2;
818     z += 2; y += 2;
819   }
820   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
821   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
822   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
823   PLogFlops(4*a->nz);
824   return 0;
825 }
826 
827 static int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
828 {
829   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
830   Scalar          *xg,*yg,*zg;
831   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
832   int             mbs=a->mbs,i,*idx,*ii,j,n,ierr;
833 
834   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
835   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
836   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
837 
838   idx   = a->j;
839   v     = a->a;
840   ii    = a->i;
841 
842   for ( i=0; i<mbs; i++ ) {
843     n  = ii[1] - ii[0]; ii++;
844     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
845     for ( j=0; j<n; j++ ) {
846       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
847       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
848       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
849       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
850       v += 9;
851     }
852     z[0] = sum1; z[1] = sum2; z[2] = sum3;
853     z += 3; y += 3;
854   }
855   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
856   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
857   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
858   PLogFlops(18*a->nz);
859   return 0;
860 }
861 
862 static int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
863 {
864   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
865   Scalar          *xg,*yg,*zg;
866   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4;
867   register Scalar x1,x2,x3,x4;
868   int             mbs=a->mbs,i,*idx,*ii;
869   int             j,n,ierr;
870 
871   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
872   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
873   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
874 
875   idx   = a->j;
876   v     = a->a;
877   ii    = a->i;
878 
879   for ( i=0; i<mbs; i++ ) {
880     n  = ii[1] - ii[0]; ii++;
881     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
882     for ( j=0; j<n; j++ ) {
883       xb = x + 4*(*idx++);
884       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
885       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
886       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
887       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
888       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
889       v += 16;
890     }
891     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
892     z += 4; y += 4;
893   }
894   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
895   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
896   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
897   PLogFlops(32*a->nz);
898   return 0;
899 }
900 
901 static int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
902 {
903   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
904   Scalar          *xg,*yg,*zg;
905   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
906   register Scalar x1,x2,x3,x4,x5;
907   int             mbs=a->mbs,i,*idx,*ii,j,n,ierr;
908 
909   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
910   ierr = VecGetArray(yy,&yg); CHKERRQ(ierr); y = yg;
911   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
912 
913   idx   = a->j;
914   v     = a->a;
915   ii    = a->i;
916 
917   for ( i=0; i<mbs; i++ ) {
918     n  = ii[1] - ii[0]; ii++;
919     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
920     for ( j=0; j<n; j++ ) {
921       xb = x + 5*(*idx++);
922       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
923       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
924       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
925       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
926       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
927       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
928       v += 25;
929     }
930     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
931     z += 5; y += 5;
932   }
933   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
934   ierr = VecRestoreArray(yy,&yg); CHKERRQ(ierr);
935   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
936   PLogFlops(50*a->nz);
937   return 0;
938 }
939 
940 static int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
941 {
942   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
943   Scalar          *xg,*zg;
944   register Scalar *x,*z,*v,*xb;
945   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
946   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
947 
948   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
949 
950   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
951   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
952 
953   idx   = a->j;
954   v     = a->a;
955   ii    = a->i;
956 
957 
958   if (!a->mult_work) {
959     k = PetscMax(a->m,a->n);
960     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
961   }
962   work = a->mult_work;
963   for ( i=0; i<mbs; i++ ) {
964     n     = ii[1] - ii[0]; ii++;
965     ncols = n*bs;
966     workt = work;
967     for ( j=0; j<n; j++ ) {
968       xb = x + bs*(*idx++);
969       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
970       workt += bs;
971     }
972     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
973     v += n*bs2;
974     z += bs;
975   }
976   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
977   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
978   PLogFlops(2*a->nz*bs2 );
979   return 0;
980 }
981 
982 static int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
983 {
984   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
985   Scalar          *xg,*zg,*zb;
986   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
987   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
988   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
989 
990 
991   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
992   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
993   PetscMemzero(z,N*sizeof(Scalar));
994 
995   idx   = a->j;
996   v     = a->a;
997   ii    = a->i;
998 
999   switch (bs) {
1000   case 1:
1001     for ( i=0; i<mbs; i++ ) {
1002       n  = ii[1] - ii[0]; ii++;
1003       xb = x + i; x1 = xb[0];
1004       ib = idx + ai[i];
1005       for ( j=0; j<n; j++ ) {
1006         rval    = ib[j];
1007         z[rval] += *v++ * x1;
1008       }
1009     }
1010     break;
1011   case 2:
1012     for ( i=0; i<mbs; i++ ) {
1013       n  = ii[1] - ii[0]; ii++;
1014       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1015       ib = idx + ai[i];
1016       for ( j=0; j<n; j++ ) {
1017         rval      = ib[j]*2;
1018         z[rval++] += v[0]*x1 + v[1]*x2;
1019         z[rval++] += v[2]*x1 + v[3]*x2;
1020         v += 4;
1021       }
1022     }
1023     break;
1024   case 3:
1025     for ( i=0; i<mbs; i++ ) {
1026       n  = ii[1] - ii[0]; ii++;
1027       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1028       ib = idx + ai[i];
1029       for ( j=0; j<n; j++ ) {
1030         rval      = ib[j]*3;
1031         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1032         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1033         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1034         v += 9;
1035       }
1036     }
1037     break;
1038   case 4:
1039     for ( i=0; i<mbs; i++ ) {
1040       n  = ii[1] - ii[0]; ii++;
1041       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1042       ib = idx + ai[i];
1043       for ( j=0; j<n; j++ ) {
1044         rval      = ib[j]*4;
1045         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1046         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1047         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1048         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1049         v += 16;
1050       }
1051     }
1052     break;
1053   case 5:
1054     for ( i=0; i<mbs; i++ ) {
1055       n  = ii[1] - ii[0]; ii++;
1056       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1057       x4 = xb[3];   x5 = xb[4];
1058       ib = idx + ai[i];
1059       for ( j=0; j<n; j++ ) {
1060         rval      = ib[j]*5;
1061         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1062         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1063         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1064         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1065         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1066         v += 25;
1067       }
1068     }
1069     break;
1070       /* block sizes larger then 5 by 5 are handled by BLAS */
1071     default: {
1072       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1073       if (!a->mult_work) {
1074         k = PetscMax(a->m,a->n);
1075         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1076         CHKPTRQ(a->mult_work);
1077       }
1078       work = a->mult_work;
1079       for ( i=0; i<mbs; i++ ) {
1080         n     = ii[1] - ii[0]; ii++;
1081         ncols = n*bs;
1082         PetscMemzero(work,ncols*sizeof(Scalar));
1083         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1084         v += n*bs2;
1085         x += bs;
1086         workt = work;
1087         for ( j=0; j<n; j++ ) {
1088           zb = z + bs*(*idx++);
1089           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1090           workt += bs;
1091         }
1092       }
1093     }
1094   }
1095   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1096   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1097   PLogFlops(2*a->nz*a->bs2 - a->n);
1098   return 0;
1099 }
1100 static int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1101 {
1102   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1103   Scalar          *xg,*zg,*zb;
1104   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1105   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1106   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1107 
1108 
1109 
1110   ierr = VecGetArray(xx,&xg); CHKERRQ(ierr); x = xg;
1111   ierr = VecGetArray(zz,&zg); CHKERRQ(ierr); z = zg;
1112 
1113   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1114   else PetscMemzero(z,N*sizeof(Scalar));
1115 
1116 
1117   idx   = a->j;
1118   v     = a->a;
1119   ii    = a->i;
1120 
1121   switch (bs) {
1122   case 1:
1123     for ( i=0; i<mbs; i++ ) {
1124       n  = ii[1] - ii[0]; ii++;
1125       xb = x + i; x1 = xb[0];
1126       ib = idx + ai[i];
1127       for ( j=0; j<n; j++ ) {
1128         rval    = ib[j];
1129         z[rval] += *v++ * x1;
1130       }
1131     }
1132     break;
1133   case 2:
1134     for ( i=0; i<mbs; i++ ) {
1135       n  = ii[1] - ii[0]; ii++;
1136       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1137       ib = idx + ai[i];
1138       for ( j=0; j<n; j++ ) {
1139         rval      = ib[j]*2;
1140         z[rval++] += v[0]*x1 + v[1]*x2;
1141         z[rval++] += v[2]*x1 + v[3]*x2;
1142         v += 4;
1143       }
1144     }
1145     break;
1146   case 3:
1147     for ( i=0; i<mbs; i++ ) {
1148       n  = ii[1] - ii[0]; ii++;
1149       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1150       ib = idx + ai[i];
1151       for ( j=0; j<n; j++ ) {
1152         rval      = ib[j]*3;
1153         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1154         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1155         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1156         v += 9;
1157       }
1158     }
1159     break;
1160   case 4:
1161     for ( i=0; i<mbs; i++ ) {
1162       n  = ii[1] - ii[0]; ii++;
1163       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1164       ib = idx + ai[i];
1165       for ( j=0; j<n; j++ ) {
1166         rval      = ib[j]*4;
1167         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1168         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1169         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1170         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1171         v += 16;
1172       }
1173     }
1174     break;
1175   case 5:
1176     for ( i=0; i<mbs; i++ ) {
1177       n  = ii[1] - ii[0]; ii++;
1178       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1179       x4 = xb[3];   x5 = xb[4];
1180       ib = idx + ai[i];
1181       for ( j=0; j<n; j++ ) {
1182         rval      = ib[j]*5;
1183         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1184         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1185         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1186         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1187         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1188         v += 25;
1189       }
1190     }
1191     break;
1192       /* block sizes larger then 5 by 5 are handled by BLAS */
1193     default: {
1194       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1195       if (!a->mult_work) {
1196         k = PetscMax(a->m,a->n);
1197         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1198         CHKPTRQ(a->mult_work);
1199       }
1200       work = a->mult_work;
1201       for ( i=0; i<mbs; i++ ) {
1202         n     = ii[1] - ii[0]; ii++;
1203         ncols = n*bs;
1204         PetscMemzero(work,ncols*sizeof(Scalar));
1205         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1206         v += n*bs2;
1207         x += bs;
1208         workt = work;
1209         for ( j=0; j<n; j++ ) {
1210           zb = z + bs*(*idx++);
1211           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1212           workt += bs;
1213         }
1214       }
1215     }
1216   }
1217   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1218   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1219   PLogFlops(2*a->nz*a->bs2);
1220   return 0;
1221 }
1222 
1223 static int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,int *nz,int *nza,int *mem)
1224 {
1225   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1226   if (nz)  *nz  = a->bs2*a->nz;
1227   if (nza) *nza = a->maxnz;
1228   if (mem) *mem = (int)A->mem;
1229   return 0;
1230 }
1231 
1232 static int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1233 {
1234   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1235 
1236   if (B->type !=MATSEQBAIJ)SETERRQ(1,"MatEqual_SeqBAIJ:Matrices must be same type");
1237 
1238   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1239   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)||
1240       (a->nz != b->nz)) {
1241     *flg = PETSC_FALSE; return 0;
1242   }
1243 
1244   /* if the a->i are the same */
1245   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1246     *flg = PETSC_FALSE; return 0;
1247   }
1248 
1249   /* if a->j are the same */
1250   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1251     *flg = PETSC_FALSE; return 0;
1252   }
1253 
1254   /* if a->a are the same */
1255   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1256     *flg = PETSC_FALSE; return 0;
1257   }
1258   *flg = PETSC_TRUE;
1259   return 0;
1260 
1261 }
1262 
1263 static int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1264 {
1265   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1266   int         i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1267   Scalar      *x, zero = 0.0,*aa,*aa_j;
1268 
1269   bs   = a->bs;
1270   aa   = a->a;
1271   ai   = a->i;
1272   aj   = a->j;
1273   ambs = a->mbs;
1274   bs2  = a->bs2;
1275 
1276   VecSet(&zero,v);
1277   VecGetArray(v,&x); VecGetLocalSize(v,&n);
1278   if (n != a->m) SETERRQ(1,"MatGetDiagonal_SeqBAIJ:Nonconforming matrix and vector");
1279   for ( i=0; i<ambs; i++ ) {
1280     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1281       if (aj[j] == i) {
1282         row  = i*bs;
1283         aa_j = aa+j*bs2;
1284         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1285         break;
1286       }
1287     }
1288   }
1289   return 0;
1290 }
1291 
1292 static int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1293 {
1294   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1295   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1296   int         i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1297 
1298   ai  = a->i;
1299   aj  = a->j;
1300   aa  = a->a;
1301   m   = a->m;
1302   n   = a->n;
1303   bs  = a->bs;
1304   mbs = a->mbs;
1305   bs2 = a->bs2;
1306   if (ll) {
1307     VecGetArray(ll,&l); VecGetSize(ll,&lm);
1308     if (lm != m) SETERRQ(1,"MatDiagonalScale_SeqBAIJ:Left scaling vector wrong length");
1309     for ( i=0; i<mbs; i++ ) { /* for each block row */
1310       M  = ai[i+1] - ai[i];
1311       li = l + i*bs;
1312       v  = aa + bs2*ai[i];
1313       for ( j=0; j<M; j++ ) { /* for each block */
1314         for ( k=0; k<bs2; k++ ) {
1315           (*v++) *= li[k%bs];
1316         }
1317       }
1318     }
1319   }
1320 
1321   if (rr) {
1322     VecGetArray(rr,&r); VecGetSize(rr,&rn);
1323     if (rn != n) SETERRQ(1,"MatDiagonalScale_SeqBAIJ:Right scaling vector wrong length");
1324     for ( i=0; i<mbs; i++ ) { /* for each block row */
1325       M  = ai[i+1] - ai[i];
1326       v  = aa + bs2*ai[i];
1327       for ( j=0; j<M; j++ ) { /* for each block */
1328         ri = r + bs*aj[ai[i]+j];
1329         for ( k=0; k<bs; k++ ) {
1330           x = ri[k];
1331           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1332         }
1333       }
1334     }
1335   }
1336   return 0;
1337 }
1338 
1339 
1340 extern int MatLUFactorSymbolic_SeqBAIJ(Mat,IS,IS,double,Mat*);
1341 extern int MatLUFactor_SeqBAIJ(Mat,IS,IS,double);
1342 extern int MatIncreaseOverlap_SeqBAIJ(Mat,int,IS*,int);
1343 extern int MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatGetSubMatrixCall,Mat*);
1344 extern int MatGetSubMatrices_SeqBAIJ(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat**);
1345 
1346 extern int MatSolve_SeqBAIJ_N(Mat,Vec,Vec);
1347 extern int MatSolve_SeqBAIJ_1(Mat,Vec,Vec);
1348 extern int MatSolve_SeqBAIJ_2(Mat,Vec,Vec);
1349 extern int MatSolve_SeqBAIJ_3(Mat,Vec,Vec);
1350 extern int MatSolve_SeqBAIJ_4(Mat,Vec,Vec);
1351 extern int MatSolve_SeqBAIJ_5(Mat,Vec,Vec);
1352 
1353 extern int MatLUFactorNumeric_SeqBAIJ_N(Mat,Mat*);
1354 extern int MatLUFactorNumeric_SeqBAIJ_1(Mat,Mat*);
1355 extern int MatLUFactorNumeric_SeqBAIJ_2(Mat,Mat*);
1356 extern int MatLUFactorNumeric_SeqBAIJ_3(Mat,Mat*);
1357 extern int MatLUFactorNumeric_SeqBAIJ_4(Mat,Mat*);
1358 extern int MatLUFactorNumeric_SeqBAIJ_5(Mat,Mat*);
1359 
1360 static int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1361 {
1362   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1363   Scalar      *v = a->a;
1364   double      sum = 0.0;
1365   int         i,nz=a->nz,bs2=a->bs2;
1366 
1367   if (type == NORM_FROBENIUS) {
1368     for (i=0; i< bs2*nz; i++ ) {
1369 #if defined(PETSC_COMPLEX)
1370       sum += real(conj(*v)*(*v)); v++;
1371 #else
1372       sum += (*v)*(*v); v++;
1373 #endif
1374     }
1375     *norm = sqrt(sum);
1376   }
1377   else {
1378     SETERRQ(1,"MatNorm_SeqBAIJ:No support for this norm yet");
1379   }
1380   return 0;
1381 }
1382 
1383 /*
1384      note: This can only work for identity for row and col. It would
1385    be good to check this and otherwise generate an error.
1386 */
1387 static int MatILUFactor_SeqBAIJ(Mat inA,IS row,IS col,double efill,int fill)
1388 {
1389   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1390   Mat         outA;
1391   int         ierr;
1392 
1393   if (fill != 0) SETERRQ(1,"MatILUFactor_SeqBAIJ:Only fill=0 supported");
1394 
1395   outA          = inA;
1396   inA->factor   = FACTOR_LU;
1397   a->row        = row;
1398   a->col        = col;
1399 
1400   a->solve_work = (Scalar *) PetscMalloc((a->m+a->bs)*sizeof(Scalar));CHKPTRQ(a->solve_work);
1401 
1402   if (!a->diag) {
1403     ierr = MatMarkDiag_SeqBAIJ(inA); CHKERRQ(ierr);
1404   }
1405   ierr = MatLUFactorNumeric(inA,&outA); CHKERRQ(ierr);
1406   return 0;
1407 }
1408 
1409 static int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1410 {
1411   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1412   int         one = 1, totalnz = a->bs2*a->nz;
1413   BLscal_( &totalnz, alpha, a->a, &one );
1414   PLogFlops(totalnz);
1415   return 0;
1416 }
1417 
1418 static int MatGetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v)
1419 {
1420   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1421   int        *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
1422   int        *ai = a->i, *ailen = a->ilen;
1423   int        brow,bcol,ridx,cidx,bs=a->bs,bs2=a->bs2;
1424   Scalar     *ap, *aa = a->a, zero = 0.0;
1425 
1426   for ( k=0; k<m; k++ ) { /* loop over rows */
1427     row  = im[k]; brow = row/bs;
1428     if (row < 0) SETERRQ(1,"MatGetValues_SeqBAIJ:Negative row");
1429     if (row >= a->m) SETERRQ(1,"MatGetValues_SeqBAIJ:Row too large");
1430     rp   = aj + ai[brow] ; ap = aa + bs2*ai[brow] ;
1431     nrow = ailen[brow];
1432     for ( l=0; l<n; l++ ) { /* loop over columns */
1433       if (in[l] < 0) SETERRQ(1,"MatGetValues_SeqBAIJ:Negative column");
1434       if (in[l] >= a->n) SETERRQ(1,"MatGetValues_SeqBAIJ:Column too large");
1435       col  = in[l] ;
1436       bcol = col/bs;
1437       cidx = col%bs;
1438       ridx = row%bs;
1439       high = nrow;
1440       low  = 0; /* assume unsorted */
1441       while (high-low > 5) {
1442         t = (low+high)/2;
1443         if (rp[t] > bcol) high = t;
1444         else             low  = t;
1445       }
1446       for ( i=low; i<high; i++ ) {
1447         if (rp[i] > bcol) break;
1448         if (rp[i] == bcol) {
1449           *v++ = ap[bs2*i+bs*cidx+ridx];
1450           goto finished;
1451         }
1452       }
1453       *v++ = zero;
1454       finished:;
1455     }
1456   }
1457   return 0;
1458 }
1459 
1460 /* -------------------------------------------------------------------*/
1461 static struct _MatOps MatOps = {MatSetValues_SeqBAIJ,
1462        MatGetRow_SeqBAIJ,MatRestoreRow_SeqBAIJ,
1463        MatMult_SeqBAIJ_N,MatMultAdd_SeqBAIJ_N,
1464        MatMultTrans_SeqBAIJ,MatMultTransAdd_SeqBAIJ,
1465        MatSolve_SeqBAIJ_N,0,
1466        0,0,
1467        MatLUFactor_SeqBAIJ,0,
1468        0,
1469        MatTranspose_SeqBAIJ,
1470        MatGetInfo_SeqBAIJ,MatEqual_SeqBAIJ,
1471        MatGetDiagonal_SeqBAIJ,MatDiagonalScale_SeqBAIJ,MatNorm_SeqBAIJ,
1472        0,MatAssemblyEnd_SeqBAIJ,
1473        0,
1474        MatSetOption_SeqBAIJ,MatZeroEntries_SeqBAIJ,0,
1475        MatGetReordering_SeqBAIJ,
1476        MatLUFactorSymbolic_SeqBAIJ,MatLUFactorNumeric_SeqBAIJ_N,0,0,
1477        MatGetSize_SeqBAIJ,MatGetSize_SeqBAIJ,MatGetOwnershipRange_SeqBAIJ,
1478        MatILUFactorSymbolic_SeqBAIJ,0,
1479        0,0,/*  MatConvert_SeqBAIJ  */ 0,
1480        MatGetSubMatrix_SeqBAIJ,0,
1481        MatConvertSameType_SeqBAIJ,0,0,
1482        MatILUFactor_SeqBAIJ,0,0,
1483        MatGetSubMatrices_SeqBAIJ,MatIncreaseOverlap_SeqBAIJ,
1484        MatGetValues_SeqBAIJ,0,
1485        0,MatScale_SeqBAIJ,
1486        0};
1487 
1488 /*@C
1489    MatCreateSeqBAIJ - Creates a sparse matrix in block AIJ (block
1490    compressed row) format.  For good matrix assembly performance the
1491    user should preallocate the matrix storage by setting the parameter nz
1492    (or the array nzz).  By setting these parameters accurately, performance
1493    during matrix assembly can be increased by more than a factor of 50.
1494 
1495    Input Parameters:
1496 .  comm - MPI communicator, set to MPI_COMM_SELF
1497 .  bs - size of block
1498 .  m - number of rows
1499 .  n - number of columns
1500 .  nz - number of block nonzeros per block row (same for all rows)
1501 .  nzz - array containing the number of block nonzeros in the various block rows
1502          (possibly different for each block row) or PETSC_NULL
1503 
1504    Output Parameter:
1505 .  A - the matrix
1506 
1507    Notes:
1508    The block AIJ format is fully compatible with standard Fortran 77
1509    storage.  That is, the stored row and column indices can begin at
1510    either one (as in Fortran) or zero.  See the users' manual for details.
1511 
1512    Specify the preallocated storage with either nz or nnz (not both).
1513    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
1514    allocation.  For additional details, see the users manual chapter on
1515    matrices and the file $(PETSC_DIR)/Performance.
1516 
1517 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1518 @*/
1519 int MatCreateSeqBAIJ(MPI_Comm comm,int bs,int m,int n,int nz,int *nnz, Mat *A)
1520 {
1521   Mat         B;
1522   Mat_SeqBAIJ *b;
1523   int         i,len,ierr,flg,mbs=m/bs,nbs=n/bs,bs2=bs*bs;
1524 
1525   if (mbs*bs!=m || nbs*bs!=n)
1526     SETERRQ(1,"MatCreateSeqBAIJ:Number rows, cols must be divisible by blocksize");
1527 
1528   *A = 0;
1529   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATSEQBAIJ,comm);
1530   PLogObjectCreate(B);
1531   B->data = (void *) (b = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(b);
1532   PetscMemzero(b,sizeof(Mat_SeqBAIJ));
1533   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1534   ierr = OptionsHasName(PETSC_NULL,"-mat_no_unroll",&flg); CHKERRQ(ierr);
1535   if (!flg) {
1536     switch (bs) {
1537       case 1:
1538         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
1539         B->ops.solve           = MatSolve_SeqBAIJ_1;
1540         B->ops.mult            = MatMult_SeqBAIJ_1;
1541         B->ops.multadd         = MatMultAdd_SeqBAIJ_1;
1542        break;
1543       case 2:
1544         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2;
1545         B->ops.solve           = MatSolve_SeqBAIJ_2;
1546         B->ops.mult            = MatMult_SeqBAIJ_2;
1547         B->ops.multadd         = MatMultAdd_SeqBAIJ_2;
1548         break;
1549       case 3:
1550         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3;
1551         B->ops.solve           = MatSolve_SeqBAIJ_3;
1552         B->ops.mult            = MatMult_SeqBAIJ_3;
1553         B->ops.multadd         = MatMultAdd_SeqBAIJ_3;
1554         break;
1555       case 4:
1556         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4;
1557         B->ops.solve           = MatSolve_SeqBAIJ_4;
1558         B->ops.mult            = MatMult_SeqBAIJ_4;
1559         B->ops.multadd         = MatMultAdd_SeqBAIJ_4;
1560         break;
1561       case 5:
1562         B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5;
1563         B->ops.solve           = MatSolve_SeqBAIJ_5;
1564         B->ops.mult            = MatMult_SeqBAIJ_5;
1565         B->ops.multadd         = MatMultAdd_SeqBAIJ_5;
1566         break;
1567     }
1568   }
1569   B->destroy          = MatDestroy_SeqBAIJ;
1570   B->view             = MatView_SeqBAIJ;
1571   B->factor           = 0;
1572   B->lupivotthreshold = 1.0;
1573   b->row              = 0;
1574   b->col              = 0;
1575   b->reallocs         = 0;
1576 
1577   b->m       = m; B->m = m; B->M = m;
1578   b->n       = n; B->n = n; B->N = n;
1579   b->mbs     = mbs;
1580   b->nbs     = nbs;
1581   b->imax    = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(b->imax);
1582   if (nnz == PETSC_NULL) {
1583     if (nz == PETSC_DEFAULT) nz = 5;
1584     else if (nz <= 0)        nz = 1;
1585     for ( i=0; i<mbs; i++ ) b->imax[i] = nz;
1586     nz = nz*mbs;
1587   }
1588   else {
1589     nz = 0;
1590     for ( i=0; i<mbs; i++ ) {b->imax[i] = nnz[i]; nz += nnz[i];}
1591   }
1592 
1593   /* allocate the matrix space */
1594   len   = nz*sizeof(int) + nz*bs2*sizeof(Scalar) + (b->m+1)*sizeof(int);
1595   b->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
1596   PetscMemzero(b->a,nz*bs2*sizeof(Scalar));
1597   b->j  = (int *) (b->a + nz*bs2);
1598   PetscMemzero(b->j,nz*sizeof(int));
1599   b->i  = b->j + nz;
1600   b->singlemalloc = 1;
1601 
1602   b->i[0] = 0;
1603   for (i=1; i<mbs+1; i++) {
1604     b->i[i] = b->i[i-1] + b->imax[i-1];
1605   }
1606 
1607   /* b->ilen will count nonzeros in each block row so far. */
1608   b->ilen = (int *) PetscMalloc((mbs+1)*sizeof(int));
1609   PLogObjectMemory(B,len+2*(mbs+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_SeqBAIJ));
1610   for ( i=0; i<mbs; i++ ) { b->ilen[i] = 0;}
1611 
1612   b->bs               = bs;
1613   b->bs2              = bs2;
1614   b->mbs              = mbs;
1615   b->nz               = 0;
1616   b->maxnz            = nz*bs2;
1617   b->sorted           = 0;
1618   b->roworiented      = 1;
1619   b->nonew            = 0;
1620   b->diag             = 0;
1621   b->solve_work       = 0;
1622   b->mult_work        = 0;
1623   b->spptr            = 0;
1624   *A = B;
1625   ierr = OptionsHasName(PETSC_NULL,"-help", &flg); CHKERRQ(ierr);
1626   if (flg) {ierr = MatPrintHelp(B); CHKERRQ(ierr); }
1627   return 0;
1628 }
1629 
1630 int MatConvertSameType_SeqBAIJ(Mat A,Mat *B,int cpvalues)
1631 {
1632   Mat         C;
1633   Mat_SeqBAIJ *c,*a = (Mat_SeqBAIJ *) A->data;
1634   int         i,len, mbs = a->mbs,nz = a->nz,bs2 =a->bs2;
1635 
1636   if (a->i[mbs] != nz) SETERRQ(1,"MatConvertSameType_SeqBAIJ:Corrupt matrix");
1637 
1638   *B = 0;
1639   PetscHeaderCreate(C,_Mat,MAT_COOKIE,MATSEQBAIJ,A->comm);
1640   PLogObjectCreate(C);
1641   C->data       = (void *) (c = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(c);
1642   PetscMemcpy(&C->ops,&A->ops,sizeof(struct _MatOps));
1643   C->destroy    = MatDestroy_SeqBAIJ;
1644   C->view       = MatView_SeqBAIJ;
1645   C->factor     = A->factor;
1646   c->row        = 0;
1647   c->col        = 0;
1648   C->assembled  = PETSC_TRUE;
1649 
1650   c->m = C->m   = a->m;
1651   c->n = C->n   = a->n;
1652   C->M          = a->m;
1653   C->N          = a->n;
1654 
1655   c->bs         = a->bs;
1656   c->bs2        = a->bs2;
1657   c->mbs        = a->mbs;
1658   c->nbs        = a->nbs;
1659 
1660   c->imax       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->imax);
1661   c->ilen       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->ilen);
1662   for ( i=0; i<mbs; i++ ) {
1663     c->imax[i] = a->imax[i];
1664     c->ilen[i] = a->ilen[i];
1665   }
1666 
1667   /* allocate the matrix space */
1668   c->singlemalloc = 1;
1669   len   = (mbs+1)*sizeof(int) + nz*(bs2*sizeof(Scalar) + sizeof(int));
1670   c->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(c->a);
1671   c->j  = (int *) (c->a + nz*bs2);
1672   c->i  = c->j + nz;
1673   PetscMemcpy(c->i,a->i,(mbs+1)*sizeof(int));
1674   if (mbs > 0) {
1675     PetscMemcpy(c->j,a->j,nz*sizeof(int));
1676     if (cpvalues == COPY_VALUES) {
1677       PetscMemcpy(c->a,a->a,bs2*nz*sizeof(Scalar));
1678     }
1679   }
1680 
1681   PLogObjectMemory(C,len+2*(mbs+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_SeqBAIJ));
1682   c->sorted      = a->sorted;
1683   c->roworiented = a->roworiented;
1684   c->nonew       = a->nonew;
1685 
1686   if (a->diag) {
1687     c->diag = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(c->diag);
1688     PLogObjectMemory(C,(mbs+1)*sizeof(int));
1689     for ( i=0; i<mbs; i++ ) {
1690       c->diag[i] = a->diag[i];
1691     }
1692   }
1693   else c->diag          = 0;
1694   c->nz                 = a->nz;
1695   c->maxnz              = a->maxnz;
1696   c->solve_work         = 0;
1697   c->spptr              = 0;      /* Dangerous -I'm throwing away a->spptr */
1698   c->mult_work          = 0;
1699   *B = C;
1700   return 0;
1701 }
1702 
1703 int MatLoad_SeqBAIJ(Viewer viewer,MatType type,Mat *A)
1704 {
1705   Mat_SeqBAIJ  *a;
1706   Mat          B;
1707   int          i,nz,ierr,fd,header[4],size,*rowlengths=0,M,N,bs=1,flg;
1708   int          *mask,mbs,*jj,j,rowcount,nzcount,k,*browlengths,maskcount;
1709   int          kmax,jcount,block,idx,point,nzcountb,extra_rows;
1710   int          *masked, nmask,tmp,bs2,ishift;
1711   Scalar       *aa;
1712   MPI_Comm     comm = ((PetscObject) viewer)->comm;
1713 
1714   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
1715   bs2  = bs*bs;
1716 
1717   MPI_Comm_size(comm,&size);
1718   if (size > 1) SETERRQ(1,"MatLoad_SeqBAIJ:view must have one processor");
1719   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1720   ierr = PetscBinaryRead(fd,header,4,BINARY_INT); CHKERRQ(ierr);
1721   if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_SeqBAIJ:not Mat object");
1722   M = header[1]; N = header[2]; nz = header[3];
1723 
1724   if (M != N) SETERRQ(1,"MatLoad_SeqBAIJ:Can only do square matrices");
1725 
1726   /*
1727      This code adds extra rows to make sure the number of rows is
1728     divisible by the blocksize
1729   */
1730   mbs        = M/bs;
1731   extra_rows = bs - M + bs*(mbs);
1732   if (extra_rows == bs) extra_rows = 0;
1733   else                  mbs++;
1734   if (extra_rows) {
1735     PLogInfo(0,"MatLoad_SeqBAIJ:Padding loaded matrix to match blocksize");
1736   }
1737 
1738   /* read in row lengths */
1739   rowlengths = (int*) PetscMalloc((M+extra_rows)*sizeof(int));CHKPTRQ(rowlengths);
1740   ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
1741   for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
1742 
1743   /* read in column indices */
1744   jj = (int*) PetscMalloc( (nz+extra_rows)*sizeof(int) ); CHKPTRQ(jj);
1745   ierr = PetscBinaryRead(fd,jj,nz,BINARY_INT); CHKERRQ(ierr);
1746   for ( i=0; i<extra_rows; i++ ) jj[nz+i] = M+i;
1747 
1748   /* loop over row lengths determining block row lengths */
1749   browlengths = (int *) PetscMalloc(mbs*sizeof(int));CHKPTRQ(browlengths);
1750   PetscMemzero(browlengths,mbs*sizeof(int));
1751   mask   = (int *) PetscMalloc( 2*mbs*sizeof(int) ); CHKPTRQ(mask);
1752   PetscMemzero(mask,mbs*sizeof(int));
1753   masked = mask + mbs;
1754   rowcount = 0; nzcount = 0;
1755   for ( i=0; i<mbs; i++ ) {
1756     nmask = 0;
1757     for ( j=0; j<bs; j++ ) {
1758       kmax = rowlengths[rowcount];
1759       for ( k=0; k<kmax; k++ ) {
1760         tmp = jj[nzcount++]/bs;
1761         if (!mask[tmp]) {masked[nmask++] = tmp; mask[tmp] = 1;}
1762       }
1763       rowcount++;
1764     }
1765     browlengths[i] += nmask;
1766     /* zero out the mask elements we set */
1767     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
1768   }
1769 
1770   /* create our matrix */
1771   ierr = MatCreateSeqBAIJ(comm,bs,M+extra_rows,N+extra_rows,0,browlengths,A);
1772          CHKERRQ(ierr);
1773   B = *A;
1774   a = (Mat_SeqBAIJ *) B->data;
1775 
1776   /* set matrix "i" values */
1777   a->i[0] = 0;
1778   for ( i=1; i<= mbs; i++ ) {
1779     a->i[i]      = a->i[i-1] + browlengths[i-1];
1780     a->ilen[i-1] = browlengths[i-1];
1781   }
1782   a->nz         = 0;
1783   for ( i=0; i<mbs; i++ ) a->nz += browlengths[i];
1784 
1785   /* read in nonzero values */
1786   aa = (Scalar *) PetscMalloc((nz+extra_rows)*sizeof(Scalar));CHKPTRQ(aa);
1787   ierr = PetscBinaryRead(fd,aa,nz,BINARY_SCALAR); CHKERRQ(ierr);
1788   for ( i=0; i<extra_rows; i++ ) aa[nz+i] = 1.0;
1789 
1790   /* set "a" and "j" values into matrix */
1791   nzcount = 0; jcount = 0;
1792   for ( i=0; i<mbs; i++ ) {
1793     nzcountb = nzcount;
1794     nmask    = 0;
1795     for ( j=0; j<bs; j++ ) {
1796       kmax = rowlengths[i*bs+j];
1797       for ( k=0; k<kmax; k++ ) {
1798         tmp = jj[nzcount++]/bs;
1799 	if (!mask[tmp]) { masked[nmask++] = tmp; mask[tmp] = 1;}
1800       }
1801       rowcount++;
1802     }
1803     /* sort the masked values */
1804     PetscSortInt(nmask,masked);
1805 
1806     /* set "j" values into matrix */
1807     maskcount = 1;
1808     for ( j=0; j<nmask; j++ ) {
1809       a->j[jcount++]  = masked[j];
1810       mask[masked[j]] = maskcount++;
1811     }
1812     /* set "a" values into matrix */
1813     ishift = bs2*a->i[i];
1814     for ( j=0; j<bs; j++ ) {
1815       kmax = rowlengths[i*bs+j];
1816       for ( k=0; k<kmax; k++ ) {
1817         tmp    = jj[nzcountb]/bs ;
1818         block  = mask[tmp] - 1;
1819         point  = jj[nzcountb] - bs*tmp;
1820         idx    = ishift + bs2*block + j + bs*point;
1821         a->a[idx] = aa[nzcountb++];
1822       }
1823     }
1824     /* zero out the mask elements we set */
1825     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
1826   }
1827   if (jcount != a->nz) SETERRQ(1,"MatLoad_SeqBAIJ:Error bad binary matrix");
1828 
1829   PetscFree(rowlengths);
1830   PetscFree(browlengths);
1831   PetscFree(aa);
1832   PetscFree(jj);
1833   PetscFree(mask);
1834 
1835   B->assembled = PETSC_TRUE;
1836 
1837   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info",&flg); CHKERRQ(ierr);
1838   if (flg) {
1839     Viewer tviewer;
1840     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
1841     ierr = ViewerSetFormat(tviewer,ASCII_FORMAT_INFO,0);CHKERRQ(ierr);
1842     ierr = MatView(B,tviewer); CHKERRQ(ierr);
1843     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1844   }
1845   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info_detailed",&flg);CHKERRQ(ierr);
1846   if (flg) {
1847     Viewer tviewer;
1848     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
1849     ierr = ViewerSetFormat(tviewer,ASCII_FORMAT_INFO_DETAILED,0);CHKERRQ(ierr);
1850     ierr = MatView(B,tviewer); CHKERRQ(ierr);
1851     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1852   }
1853   ierr = OptionsHasName(PETSC_NULL,"-mat_view",&flg); CHKERRQ(ierr);
1854   if (flg) {
1855     Viewer tviewer;
1856     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
1857     ierr = MatView(B,tviewer); CHKERRQ(ierr);
1858     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1859   }
1860   ierr = OptionsHasName(PETSC_NULL,"-mat_view_matlab",&flg); CHKERRQ(ierr);
1861   if (flg) {
1862     Viewer tviewer;
1863     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
1864     ierr = ViewerSetFormat(tviewer,ASCII_FORMAT_MATLAB,"M");CHKERRQ(ierr);
1865     ierr = MatView(B,tviewer); CHKERRQ(ierr);
1866     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1867   }
1868   ierr = OptionsHasName(PETSC_NULL,"-mat_view_draw",&flg); CHKERRQ(ierr);
1869   if (flg) {
1870     Viewer tviewer;
1871     ierr = ViewerDrawOpenX(B->comm,0,0,0,0,300,300,&tviewer); CHKERRQ(ierr);
1872     ierr = MatView(B,(Viewer)tviewer); CHKERRQ(ierr);
1873     ierr = ViewerFlush(tviewer); CHKERRQ(ierr);
1874     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
1875   }
1876   return 0;
1877 }
1878 
1879 
1880 
1881