xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision c74985f65431909a76d9ed326b933d7bd12add40)
1 
2 #include "mpiaij.h"
3 #include "vec/vecimpl.h"
4 
5 
6 #define CHUNCKSIZE   100
7 /*
8    This is a simple minded stash. Do a linear search to determine if
9  in stash, if not add to end.
10 */
11 static int StashValues(Stash *stash,int row,int n, int *idxn,
12                        Scalar *values,InsertMode addv)
13 {
14   int    i,j,N = stash->n,found,*n_idx, *n_idy;
15   Scalar val,*n_array;
16 
17   for ( i=0; i<n; i++ ) {
18     found = 0;
19     val = *values++;
20     for ( j=0; j<N; j++ ) {
21       if ( stash->idx[j] == row && stash->idy[j] == idxn[i]) {
22         /* found a match */
23         if (addv == AddValues) stash->array[j] += val;
24         else stash->array[j] = val;
25         found = 1;
26         break;
27       }
28     }
29     if (!found) { /* not found so add to end */
30       if ( stash->n == stash->nmax ) {
31         /* allocate a larger stash */
32         n_array = (Scalar *) MALLOC( (stash->nmax + CHUNCKSIZE)*(
33                                      2*sizeof(int) + sizeof(Scalar)));
34         CHKPTR(n_array);
35         n_idx = (int *) (n_array + stash->nmax + CHUNCKSIZE);
36         n_idy = (int *) (n_idx + stash->nmax + CHUNCKSIZE);
37         MEMCPY(n_array,stash->array,stash->nmax*sizeof(Scalar));
38         MEMCPY(n_idx,stash->idx,stash->nmax*sizeof(int));
39         MEMCPY(n_idy,stash->idy,stash->nmax*sizeof(int));
40         if (stash->array) FREE(stash->array);
41         stash->array = n_array; stash->idx = n_idx; stash->idy = n_idy;
42         stash->nmax += CHUNCKSIZE;
43       }
44       stash->array[stash->n]   = val;
45       stash->idx[stash->n]     = row;
46       stash->idy[stash->n++]   = idxn[i];
47     }
48   }
49   return 0;
50 }
51 
52 static int MatiAIJInsertValues(Mat mat,int m,int *idxm,int n,
53                             int *idxn,Scalar *v,InsertMode addv)
54 {
55   Matimpiaij *aij = (Matimpiaij *) mat->data;
56   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
57   int        cstart = aij->cstart, cend = aij->cend,row,col;
58 
59   if (aij->insertmode != NotSetValues && aij->insertmode != addv) {
60     SETERR(1,"You cannot mix inserts and adds");
61   }
62   aij->insertmode = addv;
63   for ( i=0; i<m; i++ ) {
64     if (idxm[i] < 0 || idxm[i] >= aij->M) {
65       if (aij->outofrange) continue;
66       else SETERR(1,"row index out of range");
67     }
68     if (idxm[i] >= rstart && idxm[i] < rend) {
69       row = idxm[i] - rstart;
70       for ( j=0; j<n; j++ ) {
71         if (idxn[i] < 0 || idxn[i] >= aij->N) {
72           if (aij->outofrange) continue;
73           else SETERR(1,"column index out of range");
74         }
75         if (idxn[j] >= cstart && idxn[j] < cend){
76           col = idxn[j] - cstart;
77           ierr = MatSetValues(aij->A,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr);
78         }
79         else {
80           col = idxn[j];
81           ierr = MatSetValues(aij->B,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr);
82         }
83       }
84     }
85     else {
86       ierr = StashValues(&aij->stash,idxm[i],n,idxn,v+i*n,addv);CHKERR(ierr);
87     }
88   }
89   return 0;
90 }
91 
92 /*
93     the assembly code is alot like the code for vectors, we should
94     sometime derive a single assembly code that can be used for
95     either case.
96 */
97 
98 static int MatiAIJBeginAssemble(Mat mat)
99 {
100   Matimpiaij  *aij = (Matimpiaij *) mat->data;
101   MPI_Comm    comm = aij->comm;
102   int         ierr, numtids = aij->numtids, *owners = aij->rowners;
103   int         mytid = aij->mytid;
104   MPI_Request *send_waits,*recv_waits;
105   int         *nprocs,i,j,n,idx,*procs,nsends,nreceives,nmax,*work;
106   int         tag = 50, *owner,*starts,count;
107   InsertMode  addv;
108   Scalar      *rvalues,*svalues;
109 
110   /* make sure all processors are either in INSERTMODE or ADDMODE */
111   MPI_Allreduce((void *) &aij->insertmode,(void *) &addv,numtids,MPI_INT,
112                 MPI_BOR,comm);
113   if (addv == (AddValues|InsertValues)) {
114     SETERR(1,"Some processors have inserted while others have added");
115   }
116   aij->insertmode = addv; /* in case this processor had no cache */
117 
118   /*  first count number of contributors to each processor */
119   nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs);
120   MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids;
121   owner = (int *) MALLOC( (aij->stash.n+1)*sizeof(int) ); CHKPTR(owner);
122   for ( i=0; i<aij->stash.n; i++ ) {
123     idx = aij->stash.idx[i];
124     for ( j=0; j<numtids; j++ ) {
125       if (idx >= owners[j] && idx < owners[j+1]) {
126         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
127       }
128     }
129   }
130   nsends = 0;  for ( i=0; i<numtids; i++ ) { nsends += procs[i];}
131 
132   /* inform other processors of number of messages and max length*/
133   work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work);
134   MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm);
135   nreceives = work[mytid];
136   MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm);
137   nmax = work[mytid];
138   FREE(work);
139 
140   /* post receives:
141        1) each message will consist of ordered pairs
142      (global index,value) we store the global index as a double
143      to simply the message passing.
144        2) since we don't know how long each individual message is we
145      allocate the largest needed buffer for each receive. Potentially
146      this is a lot of wasted space.
147 
148 
149        This could be done better.
150   */
151   rvalues = (Scalar *) MALLOC(3*(nreceives+1)*nmax*sizeof(Scalar));
152   CHKPTR(rvalues);
153   recv_waits = (MPI_Request *) MALLOC((nreceives+1)*sizeof(MPI_Request));
154   CHKPTR(recv_waits);
155   for ( i=0; i<nreceives; i++ ) {
156     MPI_Irecv((void *)(rvalues+3*nmax*i),3*nmax,MPI_SCALAR,MPI_ANY_SOURCE,tag,
157               comm,recv_waits+i);
158   }
159 
160   /* do sends:
161       1) starts[i] gives the starting index in svalues for stuff going to
162          the ith processor
163   */
164   svalues = (Scalar *) MALLOC( 3*(aij->stash.n+1)*sizeof(Scalar) );
165   CHKPTR(svalues);
166   send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request));
167   CHKPTR(send_waits);
168   starts = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(starts);
169   starts[0] = 0;
170   for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
171   for ( i=0; i<aij->stash.n; i++ ) {
172     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
173     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
174     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
175   }
176   FREE(owner);
177   starts[0] = 0;
178   for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
179   count = 0;
180   for ( i=0; i<numtids; i++ ) {
181     if (procs[i]) {
182       MPI_Isend((void*)(svalues+3*starts[i]),3*nprocs[i],MPI_SCALAR,i,tag,
183                 comm,send_waits+count++);
184     }
185   }
186   FREE(starts); FREE(nprocs);
187 
188   /* Free cache space */
189   aij->stash.nmax = aij->stash.n = 0;
190   if (aij->stash.array){ FREE(aij->stash.array); aij->stash.array = 0;}
191 
192   aij->svalues    = svalues;       aij->rvalues = rvalues;
193   aij->nsends     = nsends;         aij->nrecvs = nreceives;
194   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
195   aij->rmax       = nmax;
196 
197   return 0;
198 }
199 extern int MPIAIJSetUpMultiply(Mat);
200 
201 static int MatiAIJEndAssemble(Mat mat)
202 {
203   int        ierr;
204   Matimpiaij *aij = (Matimpiaij *) mat->data;
205 
206   MPI_Status  *send_status,recv_status;
207   int         index,idx,nrecvs = aij->nrecvs, count = nrecvs, i, n;
208   int         row,col;
209   Scalar      *values,val;
210   InsertMode  addv = aij->insertmode;
211 
212   /*  wait on receives */
213   while (count) {
214     MPI_Waitany(nrecvs,aij->recv_waits,&index,&recv_status);
215     /* unpack receives into our local space */
216     values = aij->rvalues + 3*index*aij->rmax;
217     MPI_Get_count(&recv_status,MPI_SCALAR,&n);
218     n = n/3;
219     for ( i=0; i<n; i++ ) {
220       row = (int) PETSCREAL(values[3*i]) - aij->rstart;
221       col = (int) PETSCREAL(values[3*i+1]);
222       val = values[3*i+2];
223       if (col >= aij->cstart && col < aij->cend) {
224           col -= aij->cstart;
225         MatSetValues(aij->A,1,&row,1,&col,&val,addv);
226       }
227       else {
228         MatSetValues(aij->B,1,&row,1,&col,&val,addv);
229       }
230     }
231     count--;
232   }
233   FREE(aij->recv_waits); FREE(aij->rvalues);
234 
235   /* wait on sends */
236   if (aij->nsends) {
237     send_status = (MPI_Status *) MALLOC( aij->nsends*sizeof(MPI_Status) );
238     CHKPTR(send_status);
239     MPI_Waitall(aij->nsends,aij->send_waits,send_status);
240     FREE(send_status);
241   }
242   FREE(aij->send_waits); FREE(aij->svalues);
243 
244   aij->insertmode = NotSetValues;
245   ierr = MatBeginAssembly(aij->A); CHKERR(ierr);
246   ierr = MatEndAssembly(aij->A); CHKERR(ierr);
247 
248   ierr = MPIAIJSetUpMultiply(mat); CHKERR(ierr);
249   ierr = MatBeginAssembly(aij->B); CHKERR(ierr);
250   ierr = MatEndAssembly(aij->B); CHKERR(ierr);
251   return 0;
252 }
253 
254 static int MatiZero(Mat A)
255 {
256   Matimpiaij *l = (Matimpiaij *) A->data;
257 
258   MatZeroEntries(l->A); MatZeroEntries(l->B);
259   return 0;
260 }
261 
262 /* again this uses the same basic stratagy as in the assembly and
263    scatter create routines, we should try to do it systemamatically
264    if we can figure out the proper level of generality. */
265 
266 /* the code does not do the diagonal entries correctly unless the
267    matrix is square and the column and row owerships are identical.
268    This is a BUG. The only way to fix it seems to be to access
269    aij->A and aij->B directly and not through the MatZeroRows()
270    routine.
271 */
272 static int MatiZerorows(Mat A,IS is,Scalar *diag)
273 {
274   Matimpiaij     *l = (Matimpiaij *) A->data;
275   int            i,ierr,N, *rows,*owners = l->rowners,numtids = l->numtids;
276   int            *localrows,*procs,*nprocs,j,found,idx,nsends,*work;
277   int            nmax,*svalues,*starts,*owner,nrecvs,mytid = l->mytid;
278   int            *rvalues,tag = 67,count,base,slen,n,len,*source;
279   int            *lens,index,*lrows,*values;
280   MPI_Comm       comm = l->comm;
281   MPI_Request    *send_waits,*recv_waits;
282   MPI_Status     recv_status,*send_status;
283   IS             istmp;
284 
285   ierr = ISGetLocalSize(is,&N); CHKERR(ierr);
286   ierr = ISGetIndices(is,&rows); CHKERR(ierr);
287 
288   /*  first count number of contributors to each processor */
289   nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs);
290   MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids;
291   owner = (int *) MALLOC((N+1)*sizeof(int)); CHKPTR(owner); /* see note*/
292   for ( i=0; i<N; i++ ) {
293     idx = rows[i];
294     found = 0;
295     for ( j=0; j<numtids; j++ ) {
296       if (idx >= owners[j] && idx < owners[j+1]) {
297         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
298       }
299     }
300     if (!found) SETERR(1,"Index out of range");
301   }
302   nsends = 0;  for ( i=0; i<numtids; i++ ) { nsends += procs[i];}
303 
304   /* inform other processors of number of messages and max length*/
305   work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work);
306   MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm);
307   nrecvs = work[mytid];
308   MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm);
309   nmax = work[mytid];
310   FREE(work);
311 
312   /* post receives:   */
313   rvalues = (int *) MALLOC((nrecvs+1)*nmax*sizeof(int)); /*see note */
314   CHKPTR(rvalues);
315   recv_waits = (MPI_Request *) MALLOC((nrecvs+1)*sizeof(MPI_Request));
316   CHKPTR(recv_waits);
317   for ( i=0; i<nrecvs; i++ ) {
318     MPI_Irecv((void *)(rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag,
319               comm,recv_waits+i);
320   }
321 
322   /* do sends:
323       1) starts[i] gives the starting index in svalues for stuff going to
324          the ith processor
325   */
326   svalues = (int *) MALLOC( (N+1)*sizeof(int) ); CHKPTR(svalues);
327   send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request));
328   CHKPTR(send_waits);
329   starts = (int *) MALLOC( (numtids+1)*sizeof(int) ); CHKPTR(starts);
330   starts[0] = 0;
331   for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
332   for ( i=0; i<N; i++ ) {
333     svalues[starts[owner[i]]++] = rows[i];
334   }
335   ISRestoreIndices(is,&rows);
336 
337   starts[0] = 0;
338   for ( i=1; i<numtids+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
339   count = 0;
340   for ( i=0; i<numtids; i++ ) {
341     if (procs[i]) {
342       MPI_Isend((void*)(svalues+starts[i]),nprocs[i],MPI_INT,i,tag,
343                 comm,send_waits+count++);
344     }
345   }
346   FREE(starts);
347 
348   base = owners[mytid];
349 
350   /*  wait on receives */
351   lens = (int *) MALLOC( 2*(nrecvs+1)*sizeof(int) ); CHKPTR(lens);
352   source = lens + nrecvs;
353   count = nrecvs; slen = 0;
354   while (count) {
355     MPI_Waitany(nrecvs,recv_waits,&index,&recv_status);
356     /* unpack receives into our local space */
357     MPI_Get_count(&recv_status,MPI_INT,&n);
358     source[index]  = recv_status.MPI_SOURCE;
359     lens[index]  = n;
360     slen += n;
361     count--;
362   }
363   FREE(recv_waits);
364 
365   /* move the data into the send scatter */
366   lrows = (int *) MALLOC( slen*sizeof(int) ); CHKPTR(lrows);
367   count = 0;
368   for ( i=0; i<nrecvs; i++ ) {
369     values = rvalues + i*nmax;
370     for ( j=0; j<lens[i]; j++ ) {
371       lrows[count++] = values[j] - base;
372     }
373   }
374   FREE(rvalues); FREE(lens);
375   FREE(owner); FREE(nprocs);
376 
377   /* actually zap the local rows */
378   ierr = ISCreateSequential(slen,lrows,&istmp); CHKERR(ierr);  FREE(lrows);
379   ierr = MatZeroRows(l->A,istmp,diag); CHKERR(ierr);
380   ierr = MatZeroRows(l->B,istmp,0); CHKERR(ierr);
381   ierr = ISDestroy(istmp); CHKERR(ierr);
382 
383   /* wait on sends */
384   if (nsends) {
385     send_status = (MPI_Status *) MALLOC( nsends*sizeof(MPI_Status) );
386     CHKPTR(send_status);
387     MPI_Waitall(nsends,send_waits,send_status);
388     FREE(send_status);
389   }
390   FREE(send_waits); FREE(svalues);
391 
392 
393   return 0;
394 }
395 
396 static int MatiAIJMult(Mat aijin,Vec xx,Vec yy)
397 {
398   Matimpiaij *aij = (Matimpiaij *) aijin->data;
399   int        ierr;
400 
401   ierr = VecScatterBegin(xx,0,aij->lvec,0,InsertValues,&aij->Mvctx);
402   CHKERR(ierr);
403   ierr = MatMult(aij->A,xx,yy); CHKERR(ierr);
404   ierr = VecScatterEnd(xx,0,aij->lvec,0,InsertValues,&aij->Mvctx); CHKERR(ierr);
405   ierr = MatMultAdd(aij->B,aij->lvec,yy,yy); CHKERR(ierr);
406   return 0;
407 }
408 
409 /*
410   This only works correctly for square matrices where the subblock A->A is the
411    diagonal block
412 */
413 static int MatiAIJgetdiag(Mat Ain,Vec v)
414 {
415   Matimpiaij *A = (Matimpiaij *) Ain->data;
416   return MatGetDiagonal(A->A,v);
417 }
418 
419 static int MatiAIJdestroy(PetscObject obj)
420 {
421   Mat        mat = (Mat) obj;
422   Matimpiaij *aij = (Matimpiaij *) mat->data;
423   int        ierr;
424   FREE(aij->rowners);
425   ierr = MatDestroy(aij->A); CHKERR(ierr);
426   ierr = MatDestroy(aij->B); CHKERR(ierr);
427   FREE(aij); FREE(mat);
428   if (aij->lvec) VecDestroy(aij->lvec);
429   if (aij->Mvctx) VecScatterCtxDestroy(aij->Mvctx);
430   return 0;
431 }
432 
433 static int MatiView(PetscObject obj,Viewer viewer)
434 {
435   Mat        mat = (Mat) obj;
436   Matimpiaij *aij = (Matimpiaij *) mat->data;
437   int        ierr;
438 
439   MPE_Seq_begin(aij->comm,1);
440   printf("[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
441           aij->mytid,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
442           aij->cend);
443   ierr = MatView(aij->A,viewer); CHKERR(ierr);
444   ierr = MatView(aij->B,viewer); CHKERR(ierr);
445   MPE_Seq_end(aij->comm,1);
446   return 0;
447 }
448 
449 /*
450     This has to provide several versions.
451 
452      1) per sequential
453      2) a) use only local smoothing updating outer values only once.
454         b) local smoothing updating outer values each inner iteration
455      3) color updating out values betwen colors. (this imples an
456         ordering that is sort of related to the IS argument, it
457         is not clear a IS argument makes the most sense perhaps it
458         should be dropped.
459 */
460 static int MatiAIJrelax(Mat matin,Vec bb,double omega,int flag,IS is,
461                         int its,Vec xx)
462 {
463   Matimpiaij *mat = (Matimpiaij *) matin->data;
464   Scalar     zero = 0.0;
465   int        ierr,one = 1, tmp, *idx, *diag;
466   int        n = mat->n, m = mat->m, i, j;
467 
468   if (is) SETERR(1,"No support for ordering in relaxation");
469   if (flag & SOR_ZERO_INITIAL_GUESS) {
470     if (ierr = VecSet(&zero,xx)) return ierr;
471   }
472 
473   /* update outer values from other processors*/
474 
475   /* smooth locally */
476   return 0;
477 }
478 static int MatiAIJinsopt(Mat aijin,int op)
479 {
480   Matimpiaij *aij = (Matimpiaij *) aijin->data;
481 
482   if      (op == NO_NEW_NONZERO_LOCATIONS)  {
483     MatSetOption(aij->A,op);
484     MatSetOption(aij->B,op);
485   }
486   else if (op == YES_NEW_NONZERO_LOCATIONS) {
487     MatSetOption(aij->A,op);
488     MatSetOption(aij->B,op);
489   }
490   else if (op == ALLOW_OUT_OF_RANGE)  aij->outofrange  = 1;
491   else if (op == COLUMN_ORIENTED) SETERR(1,"Column oriented not supported");
492   return 0;
493 }
494 
495 static int MatiAIJsize(Mat matin,int *m,int *n)
496 {
497   Matimpiaij *mat = (Matimpiaij *) matin->data;
498   *m = mat->M; *n = mat->N;
499   return 0;
500 }
501 
502 static int MatiAIJlocalsize(Mat matin,int *m,int *n)
503 {
504   Matimpiaij *mat = (Matimpiaij *) matin->data;
505   *m = mat->m; *n = mat->n;
506   return 0;
507 }
508 
509 static int MatiAIJrange(Mat matin,int *m,int *n)
510 {
511   Matimpiaij *mat = (Matimpiaij *) matin->data;
512   *m = mat->rstart; *n = mat->rend;
513   return 0;
514 }
515 
516 /* -------------------------------------------------------------------*/
517 static struct _MatOps MatOps = {MatiAIJInsertValues,
518        0, 0,
519        MatiAIJMult,0,0,0,
520        0,0,0,0,
521        0,0,
522        MatiAIJrelax,
523        0,
524        0,0,0,
525        0,
526        MatiAIJgetdiag,0,0,
527        MatiAIJBeginAssemble,MatiAIJEndAssemble,
528        0,
529        MatiAIJinsopt,MatiZero,MatiZerorows,0,
530        0,0,0,0,
531        MatiAIJsize,MatiAIJlocalsize,MatiAIJrange };
532 
533 
534 
535 /*@
536 
537       MatCreateMPIAIJ - Creates a sparse parallel matrix
538                                  in AIJ format.
539 
540   Input Parameters:
541 .   comm - MPI communicator
542 .   m,n - number of local rows and columns (or -1 to have calculated)
543 .   M,N - global rows and columns (or -1 to have calculated)
544 .   d_nz - total number nonzeros in diagonal portion of matrix
545 .   d_nzz - number of nonzeros per row in diagonal portion of matrix or null
546 .           You must leave room for the diagonal entry even if it is zero.
547 .   o_nz - total number nonzeros in off-diagonal portion of matrix
548 .   o_nzz - number of nonzeros per row in off-diagonal portion of matrix
549 .           or null. You must have at least one nonzero per row.
550 
551   Output parameters:
552 .  newmat - the matrix
553 
554   Keywords: matrix, aij, compressed row, sparse, parallel
555 @*/
556 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
557                  int d_nz,int *d_nnz, int o_nz,int *o_nnz,Mat *newmat)
558 {
559   Mat          mat;
560   Matimpiaij   *aij;
561   int          ierr, i,rl,len,sum[2],work[2];
562   *newmat         = 0;
563   CREATEHEADER(mat,_Mat);
564   mat->data       = (void *) (aij = NEW(Matimpiaij)); CHKPTR(aij);
565   mat->cookie     = MAT_COOKIE;
566   mat->ops        = &MatOps;
567   mat->destroy    = MatiAIJdestroy;
568   mat->view       = MatiView;
569   mat->type       = MATAIJMPI;
570   mat->factor     = 0;
571   mat->row        = 0;
572   mat->col        = 0;
573   aij->comm       = comm;
574   aij->insertmode = NotSetValues;
575   MPI_Comm_rank(comm,&aij->mytid);
576   MPI_Comm_size(comm,&aij->numtids);
577 
578   if (M == -1 || N == -1) {
579     work[0] = m; work[1] = n;
580     MPI_Allreduce((void *) work,(void *) sum,1,MPI_INT,MPI_SUM,comm );
581     if (M == -1) M = sum[0];
582     if (N == -1) N = sum[1];
583   }
584   if (m == -1) {m = M/aij->numtids + ((M % aij->numtids) > aij->mytid);}
585   if (n == -1) {n = N/aij->numtids + ((N % aij->numtids) > aij->mytid);}
586   aij->m       = m;
587   aij->n       = n;
588   aij->N       = N;
589   aij->M       = M;
590 
591   /* build local table of row and column ownerships */
592   aij->rowners = (int *) MALLOC(2*(aij->numtids+2)*sizeof(int));
593   CHKPTR(aij->rowners);
594   aij->cowners = aij->rowners + aij->numtids + 1;
595   MPI_Allgather(&m,1,MPI_INT,aij->rowners+1,1,MPI_INT,comm);
596   aij->rowners[0] = 0;
597   for ( i=2; i<=aij->numtids; i++ ) {
598     aij->rowners[i] += aij->rowners[i-1];
599   }
600   aij->rstart = aij->rowners[aij->mytid];
601   aij->rend   = aij->rowners[aij->mytid+1];
602   MPI_Allgather(&n,1,MPI_INT,aij->cowners+1,1,MPI_INT,comm);
603   aij->cowners[0] = 0;
604   for ( i=2; i<=aij->numtids; i++ ) {
605     aij->cowners[i] += aij->cowners[i-1];
606   }
607   aij->cstart = aij->cowners[aij->mytid];
608   aij->cend   = aij->cowners[aij->mytid+1];
609 
610 
611   ierr = MatCreateSequentialAIJ(m,n,d_nz,d_nnz,&aij->A); CHKERR(ierr);
612   ierr = MatCreateSequentialAIJ(m,N,o_nz,o_nnz,&aij->B); CHKERR(ierr);
613 
614   /* build cache for off array entries formed */
615   aij->stash.nmax = CHUNCKSIZE; /* completely arbratray number */
616   aij->stash.n    = 0;
617   aij->stash.array = (Scalar *) MALLOC( aij->stash.nmax*(2*sizeof(int) +
618                             sizeof(Scalar))); CHKPTR(aij->stash.array);
619   aij->stash.idx = (int *) (aij->stash.array + aij->stash.nmax);
620   aij->stash.idy = (int *) (aij->stash.idx + aij->stash.nmax);
621 
622   /* stuff used for matrix vector multiply */
623   aij->lvec      = 0;
624   aij->Mvctx     = 0;
625 
626   aij->outofrange = 0;
627 
628   *newmat = mat;
629   return 0;
630 }
631