1*4c1414c8SBarry Smith #define PETSCMAT_DLL 2*4c1414c8SBarry Smith 3*4c1414c8SBarry Smith /* 4*4c1414c8SBarry Smith This file provides high performance routines for the Inode format (compressed sparse row) 5*4c1414c8SBarry Smith by taking advantage of rows with identical nonzero structure (I-nodes). 6*4c1414c8SBarry Smith */ 7*4c1414c8SBarry Smith #include "src/mat/impls/aij/seq/aij.h" 8*4c1414c8SBarry Smith 9*4c1414c8SBarry Smith #undef __FUNCT__ 10*4c1414c8SBarry Smith #define __FUNCT__ "Mat_CreateColInode" 11*4c1414c8SBarry Smith static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12*4c1414c8SBarry Smith { 13*4c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14*4c1414c8SBarry Smith PetscErrorCode ierr; 15*4c1414c8SBarry Smith PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16*4c1414c8SBarry Smith 17*4c1414c8SBarry Smith PetscFunctionBegin; 18*4c1414c8SBarry Smith n = A->cmap.n; 19*4c1414c8SBarry Smith m = A->rmap.n; 20*4c1414c8SBarry Smith ns_row = a->inode.size; 21*4c1414c8SBarry Smith 22*4c1414c8SBarry Smith min_mn = (m < n) ? m : n; 23*4c1414c8SBarry Smith if (!ns) { 24*4c1414c8SBarry Smith for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25*4c1414c8SBarry Smith for(; count+1 < n; count++,i++); 26*4c1414c8SBarry Smith if (count < n) { 27*4c1414c8SBarry Smith i++; 28*4c1414c8SBarry Smith } 29*4c1414c8SBarry Smith *size = i; 30*4c1414c8SBarry Smith PetscFunctionReturn(0); 31*4c1414c8SBarry Smith } 32*4c1414c8SBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33*4c1414c8SBarry Smith 34*4c1414c8SBarry Smith /* Use the same row structure wherever feasible. */ 35*4c1414c8SBarry Smith for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36*4c1414c8SBarry Smith ns_col[i] = ns_row[i]; 37*4c1414c8SBarry Smith } 38*4c1414c8SBarry Smith 39*4c1414c8SBarry Smith /* if m < n; pad up the remainder with inode_limit */ 40*4c1414c8SBarry Smith for(; count+1 < n; count++,i++) { 41*4c1414c8SBarry Smith ns_col[i] = 1; 42*4c1414c8SBarry Smith } 43*4c1414c8SBarry Smith /* The last node is the odd ball. padd it up with the remaining rows; */ 44*4c1414c8SBarry Smith if (count < n) { 45*4c1414c8SBarry Smith ns_col[i] = n - count; 46*4c1414c8SBarry Smith i++; 47*4c1414c8SBarry Smith } else if (count > n) { 48*4c1414c8SBarry Smith /* Adjust for the over estimation */ 49*4c1414c8SBarry Smith ns_col[i-1] += n - count; 50*4c1414c8SBarry Smith } 51*4c1414c8SBarry Smith *size = i; 52*4c1414c8SBarry Smith *ns = ns_col; 53*4c1414c8SBarry Smith PetscFunctionReturn(0); 54*4c1414c8SBarry Smith } 55*4c1414c8SBarry Smith 56*4c1414c8SBarry Smith 57*4c1414c8SBarry Smith /* 58*4c1414c8SBarry Smith This builds symmetric version of nonzero structure, 59*4c1414c8SBarry Smith */ 60*4c1414c8SBarry Smith #undef __FUNCT__ 61*4c1414c8SBarry Smith #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric" 62*4c1414c8SBarry Smith static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63*4c1414c8SBarry Smith { 64*4c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65*4c1414c8SBarry Smith PetscErrorCode ierr; 66*4c1414c8SBarry Smith PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67*4c1414c8SBarry Smith PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68*4c1414c8SBarry Smith 69*4c1414c8SBarry Smith PetscFunctionBegin; 70*4c1414c8SBarry Smith nslim_row = a->inode.node_count; 71*4c1414c8SBarry Smith m = A->rmap.n; 72*4c1414c8SBarry Smith n = A->cmap.n; 73*4c1414c8SBarry Smith if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square"); 74*4c1414c8SBarry Smith 75*4c1414c8SBarry Smith /* Use the row_inode as column_inode */ 76*4c1414c8SBarry Smith nslim_col = nslim_row; 77*4c1414c8SBarry Smith ns_col = ns_row; 78*4c1414c8SBarry Smith 79*4c1414c8SBarry Smith /* allocate space for reformated inode structure */ 80*4c1414c8SBarry Smith ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81*4c1414c8SBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82*4c1414c8SBarry Smith for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83*4c1414c8SBarry Smith 84*4c1414c8SBarry Smith for (i1=0,col=0; i1<nslim_col; ++i1){ 85*4c1414c8SBarry Smith nsz = ns_col[i1]; 86*4c1414c8SBarry Smith for (i2=0; i2<nsz; ++i2,++col) 87*4c1414c8SBarry Smith tvc[col] = i1; 88*4c1414c8SBarry Smith } 89*4c1414c8SBarry Smith /* allocate space for row pointers */ 90*4c1414c8SBarry Smith ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91*4c1414c8SBarry Smith *iia = ia; 92*4c1414c8SBarry Smith ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93*4c1414c8SBarry Smith ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94*4c1414c8SBarry Smith 95*4c1414c8SBarry Smith /* determine the number of columns in each row */ 96*4c1414c8SBarry Smith ia[0] = oshift; 97*4c1414c8SBarry Smith for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98*4c1414c8SBarry Smith 99*4c1414c8SBarry Smith j = aj + ai[row] + ishift; 100*4c1414c8SBarry Smith jmax = aj + ai[row+1] + ishift; 101*4c1414c8SBarry Smith i2 = 0; 102*4c1414c8SBarry Smith col = *j++ + ishift; 103*4c1414c8SBarry Smith i2 = tvc[col]; 104*4c1414c8SBarry Smith while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105*4c1414c8SBarry Smith ia[i1+1]++; 106*4c1414c8SBarry Smith ia[i2+1]++; 107*4c1414c8SBarry Smith i2++; /* Start col of next node */ 108*4c1414c8SBarry Smith while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109*4c1414c8SBarry Smith i2 = tvc[col]; 110*4c1414c8SBarry Smith } 111*4c1414c8SBarry Smith if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112*4c1414c8SBarry Smith } 113*4c1414c8SBarry Smith 114*4c1414c8SBarry Smith /* shift ia[i] to point to next row */ 115*4c1414c8SBarry Smith for (i1=1; i1<nslim_row+1; i1++) { 116*4c1414c8SBarry Smith row = ia[i1-1]; 117*4c1414c8SBarry Smith ia[i1] += row; 118*4c1414c8SBarry Smith work[i1-1] = row - oshift; 119*4c1414c8SBarry Smith } 120*4c1414c8SBarry Smith 121*4c1414c8SBarry Smith /* allocate space for column pointers */ 122*4c1414c8SBarry Smith nz = ia[nslim_row] + (!ishift); 123*4c1414c8SBarry Smith ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124*4c1414c8SBarry Smith *jja = ja; 125*4c1414c8SBarry Smith 126*4c1414c8SBarry Smith /* loop over lower triangular part putting into ja */ 127*4c1414c8SBarry Smith for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128*4c1414c8SBarry Smith j = aj + ai[row] + ishift; 129*4c1414c8SBarry Smith jmax = aj + ai[row+1] + ishift; 130*4c1414c8SBarry Smith i2 = 0; /* Col inode index */ 131*4c1414c8SBarry Smith col = *j++ + ishift; 132*4c1414c8SBarry Smith i2 = tvc[col]; 133*4c1414c8SBarry Smith while (i2<i1 && j<jmax) { 134*4c1414c8SBarry Smith ja[work[i2]++] = i1 + oshift; 135*4c1414c8SBarry Smith ja[work[i1]++] = i2 + oshift; 136*4c1414c8SBarry Smith ++i2; 137*4c1414c8SBarry Smith while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138*4c1414c8SBarry Smith i2 = tvc[col]; 139*4c1414c8SBarry Smith } 140*4c1414c8SBarry Smith if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141*4c1414c8SBarry Smith 142*4c1414c8SBarry Smith } 143*4c1414c8SBarry Smith ierr = PetscFree(work);CHKERRQ(ierr); 144*4c1414c8SBarry Smith ierr = PetscFree(tns);CHKERRQ(ierr); 145*4c1414c8SBarry Smith ierr = PetscFree(tvc);CHKERRQ(ierr); 146*4c1414c8SBarry Smith PetscFunctionReturn(0); 147*4c1414c8SBarry Smith } 148*4c1414c8SBarry Smith 149*4c1414c8SBarry Smith /* 150*4c1414c8SBarry Smith This builds nonsymmetric version of nonzero structure, 151*4c1414c8SBarry Smith */ 152*4c1414c8SBarry Smith #undef __FUNCT__ 153*4c1414c8SBarry Smith #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric" 154*4c1414c8SBarry Smith static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155*4c1414c8SBarry Smith { 156*4c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157*4c1414c8SBarry Smith PetscErrorCode ierr; 158*4c1414c8SBarry Smith PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159*4c1414c8SBarry Smith PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160*4c1414c8SBarry Smith 161*4c1414c8SBarry Smith PetscFunctionBegin; 162*4c1414c8SBarry Smith nslim_row = a->inode.node_count; 163*4c1414c8SBarry Smith n = A->cmap.n; 164*4c1414c8SBarry Smith 165*4c1414c8SBarry Smith /* Create The column_inode for this matrix */ 166*4c1414c8SBarry Smith ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167*4c1414c8SBarry Smith 168*4c1414c8SBarry Smith /* allocate space for reformated column_inode structure */ 169*4c1414c8SBarry Smith ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170*4c1414c8SBarry Smith ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171*4c1414c8SBarry Smith for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172*4c1414c8SBarry Smith 173*4c1414c8SBarry Smith for (i1=0,col=0; i1<nslim_col; ++i1){ 174*4c1414c8SBarry Smith nsz = ns_col[i1]; 175*4c1414c8SBarry Smith for (i2=0; i2<nsz; ++i2,++col) 176*4c1414c8SBarry Smith tvc[col] = i1; 177*4c1414c8SBarry Smith } 178*4c1414c8SBarry Smith /* allocate space for row pointers */ 179*4c1414c8SBarry Smith ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180*4c1414c8SBarry Smith *iia = ia; 181*4c1414c8SBarry Smith ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182*4c1414c8SBarry Smith ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183*4c1414c8SBarry Smith 184*4c1414c8SBarry Smith /* determine the number of columns in each row */ 185*4c1414c8SBarry Smith ia[0] = oshift; 186*4c1414c8SBarry Smith for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187*4c1414c8SBarry Smith j = aj + ai[row] + ishift; 188*4c1414c8SBarry Smith col = *j++ + ishift; 189*4c1414c8SBarry Smith i2 = tvc[col]; 190*4c1414c8SBarry Smith nz = ai[row+1] - ai[row]; 191*4c1414c8SBarry Smith while (nz-- > 0) { /* off-diagonal elemets */ 192*4c1414c8SBarry Smith ia[i1+1]++; 193*4c1414c8SBarry Smith i2++; /* Start col of next node */ 194*4c1414c8SBarry Smith while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195*4c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 196*4c1414c8SBarry Smith } 197*4c1414c8SBarry Smith } 198*4c1414c8SBarry Smith 199*4c1414c8SBarry Smith /* shift ia[i] to point to next row */ 200*4c1414c8SBarry Smith for (i1=1; i1<nslim_row+1; i1++) { 201*4c1414c8SBarry Smith row = ia[i1-1]; 202*4c1414c8SBarry Smith ia[i1] += row; 203*4c1414c8SBarry Smith work[i1-1] = row - oshift; 204*4c1414c8SBarry Smith } 205*4c1414c8SBarry Smith 206*4c1414c8SBarry Smith /* allocate space for column pointers */ 207*4c1414c8SBarry Smith nz = ia[nslim_row] + (!ishift); 208*4c1414c8SBarry Smith ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209*4c1414c8SBarry Smith *jja = ja; 210*4c1414c8SBarry Smith 211*4c1414c8SBarry Smith /* loop over matrix putting into ja */ 212*4c1414c8SBarry Smith for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213*4c1414c8SBarry Smith j = aj + ai[row] + ishift; 214*4c1414c8SBarry Smith i2 = 0; /* Col inode index */ 215*4c1414c8SBarry Smith col = *j++ + ishift; 216*4c1414c8SBarry Smith i2 = tvc[col]; 217*4c1414c8SBarry Smith nz = ai[row+1] - ai[row]; 218*4c1414c8SBarry Smith while (nz-- > 0) { 219*4c1414c8SBarry Smith ja[work[i1]++] = i2 + oshift; 220*4c1414c8SBarry Smith ++i2; 221*4c1414c8SBarry Smith while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222*4c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 223*4c1414c8SBarry Smith } 224*4c1414c8SBarry Smith } 225*4c1414c8SBarry Smith ierr = PetscFree(ns_col);CHKERRQ(ierr); 226*4c1414c8SBarry Smith ierr = PetscFree(work);CHKERRQ(ierr); 227*4c1414c8SBarry Smith ierr = PetscFree(tns);CHKERRQ(ierr); 228*4c1414c8SBarry Smith ierr = PetscFree(tvc);CHKERRQ(ierr); 229*4c1414c8SBarry Smith PetscFunctionReturn(0); 230*4c1414c8SBarry Smith } 231*4c1414c8SBarry Smith 232*4c1414c8SBarry Smith #undef __FUNCT__ 233*4c1414c8SBarry Smith #define __FUNCT__ "MatGetRowIJ_Inode" 234*4c1414c8SBarry Smith static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235*4c1414c8SBarry Smith { 236*4c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237*4c1414c8SBarry Smith PetscErrorCode ierr; 238*4c1414c8SBarry Smith 239*4c1414c8SBarry Smith PetscFunctionBegin; 240*4c1414c8SBarry Smith *n = a->inode.node_count; 241*4c1414c8SBarry Smith if (!ia) PetscFunctionReturn(0); 242*4c1414c8SBarry Smith 243*4c1414c8SBarry Smith if (symmetric) { 244*4c1414c8SBarry Smith ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 245*4c1414c8SBarry Smith } else { 246*4c1414c8SBarry Smith ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 247*4c1414c8SBarry Smith } 248*4c1414c8SBarry Smith PetscFunctionReturn(0); 249*4c1414c8SBarry Smith } 250*4c1414c8SBarry Smith 251*4c1414c8SBarry Smith #undef __FUNCT__ 252*4c1414c8SBarry Smith #define __FUNCT__ "MatRestoreRowIJ_Inode" 253*4c1414c8SBarry Smith static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 254*4c1414c8SBarry Smith { 255*4c1414c8SBarry Smith PetscErrorCode ierr; 256*4c1414c8SBarry Smith 257*4c1414c8SBarry Smith PetscFunctionBegin; 258*4c1414c8SBarry Smith if (!ia) PetscFunctionReturn(0); 259*4c1414c8SBarry Smith ierr = PetscFree(*ia);CHKERRQ(ierr); 260*4c1414c8SBarry Smith ierr = PetscFree(*ja);CHKERRQ(ierr); 261*4c1414c8SBarry Smith PetscFunctionReturn(0); 262*4c1414c8SBarry Smith } 263*4c1414c8SBarry Smith 264*4c1414c8SBarry Smith /* ----------------------------------------------------------- */ 265*4c1414c8SBarry Smith 266*4c1414c8SBarry Smith #undef __FUNCT__ 267*4c1414c8SBarry Smith #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric" 268*4c1414c8SBarry Smith static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 269*4c1414c8SBarry Smith { 270*4c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 271*4c1414c8SBarry Smith PetscErrorCode ierr; 272*4c1414c8SBarry Smith PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 273*4c1414c8SBarry Smith PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 274*4c1414c8SBarry Smith 275*4c1414c8SBarry Smith PetscFunctionBegin; 276*4c1414c8SBarry Smith nslim_row = a->inode.node_count; 277*4c1414c8SBarry Smith n = A->cmap.n; 278*4c1414c8SBarry Smith 279*4c1414c8SBarry Smith /* Create The column_inode for this matrix */ 280*4c1414c8SBarry Smith ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 281*4c1414c8SBarry Smith 282*4c1414c8SBarry Smith /* allocate space for reformated column_inode structure */ 283*4c1414c8SBarry Smith ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 284*4c1414c8SBarry Smith ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 285*4c1414c8SBarry Smith for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 286*4c1414c8SBarry Smith 287*4c1414c8SBarry Smith for (i1=0,col=0; i1<nslim_col; ++i1){ 288*4c1414c8SBarry Smith nsz = ns_col[i1]; 289*4c1414c8SBarry Smith for (i2=0; i2<nsz; ++i2,++col) 290*4c1414c8SBarry Smith tvc[col] = i1; 291*4c1414c8SBarry Smith } 292*4c1414c8SBarry Smith /* allocate space for column pointers */ 293*4c1414c8SBarry Smith ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 294*4c1414c8SBarry Smith *iia = ia; 295*4c1414c8SBarry Smith ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 296*4c1414c8SBarry Smith ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 297*4c1414c8SBarry Smith 298*4c1414c8SBarry Smith /* determine the number of columns in each row */ 299*4c1414c8SBarry Smith ia[0] = oshift; 300*4c1414c8SBarry Smith for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 301*4c1414c8SBarry Smith j = aj + ai[row] + ishift; 302*4c1414c8SBarry Smith col = *j++ + ishift; 303*4c1414c8SBarry Smith i2 = tvc[col]; 304*4c1414c8SBarry Smith nz = ai[row+1] - ai[row]; 305*4c1414c8SBarry Smith while (nz-- > 0) { /* off-diagonal elemets */ 306*4c1414c8SBarry Smith /* ia[i1+1]++; */ 307*4c1414c8SBarry Smith ia[i2+1]++; 308*4c1414c8SBarry Smith i2++; 309*4c1414c8SBarry Smith while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 310*4c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 311*4c1414c8SBarry Smith } 312*4c1414c8SBarry Smith } 313*4c1414c8SBarry Smith 314*4c1414c8SBarry Smith /* shift ia[i] to point to next col */ 315*4c1414c8SBarry Smith for (i1=1; i1<nslim_col+1; i1++) { 316*4c1414c8SBarry Smith col = ia[i1-1]; 317*4c1414c8SBarry Smith ia[i1] += col; 318*4c1414c8SBarry Smith work[i1-1] = col - oshift; 319*4c1414c8SBarry Smith } 320*4c1414c8SBarry Smith 321*4c1414c8SBarry Smith /* allocate space for column pointers */ 322*4c1414c8SBarry Smith nz = ia[nslim_col] + (!ishift); 323*4c1414c8SBarry Smith ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 324*4c1414c8SBarry Smith *jja = ja; 325*4c1414c8SBarry Smith 326*4c1414c8SBarry Smith /* loop over matrix putting into ja */ 327*4c1414c8SBarry Smith for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 328*4c1414c8SBarry Smith j = aj + ai[row] + ishift; 329*4c1414c8SBarry Smith i2 = 0; /* Col inode index */ 330*4c1414c8SBarry Smith col = *j++ + ishift; 331*4c1414c8SBarry Smith i2 = tvc[col]; 332*4c1414c8SBarry Smith nz = ai[row+1] - ai[row]; 333*4c1414c8SBarry Smith while (nz-- > 0) { 334*4c1414c8SBarry Smith /* ja[work[i1]++] = i2 + oshift; */ 335*4c1414c8SBarry Smith ja[work[i2]++] = i1 + oshift; 336*4c1414c8SBarry Smith i2++; 337*4c1414c8SBarry Smith while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 338*4c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 339*4c1414c8SBarry Smith } 340*4c1414c8SBarry Smith } 341*4c1414c8SBarry Smith ierr = PetscFree(ns_col);CHKERRQ(ierr); 342*4c1414c8SBarry Smith ierr = PetscFree(work);CHKERRQ(ierr); 343*4c1414c8SBarry Smith ierr = PetscFree(tns);CHKERRQ(ierr); 344*4c1414c8SBarry Smith ierr = PetscFree(tvc);CHKERRQ(ierr); 345*4c1414c8SBarry Smith PetscFunctionReturn(0); 346*4c1414c8SBarry Smith } 347*4c1414c8SBarry Smith 348*4c1414c8SBarry Smith #undef __FUNCT__ 349*4c1414c8SBarry Smith #define __FUNCT__ "MatGetColumnIJ_Inode" 350*4c1414c8SBarry Smith static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 351*4c1414c8SBarry Smith { 352*4c1414c8SBarry Smith PetscErrorCode ierr; 353*4c1414c8SBarry Smith 354*4c1414c8SBarry Smith PetscFunctionBegin; 355*4c1414c8SBarry Smith ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 356*4c1414c8SBarry Smith if (!ia) PetscFunctionReturn(0); 357*4c1414c8SBarry Smith 358*4c1414c8SBarry Smith if (symmetric) { 359*4c1414c8SBarry Smith /* Since the indices are symmetric it does'nt matter */ 360*4c1414c8SBarry Smith ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 361*4c1414c8SBarry Smith } else { 362*4c1414c8SBarry Smith ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 363*4c1414c8SBarry Smith } 364*4c1414c8SBarry Smith PetscFunctionReturn(0); 365*4c1414c8SBarry Smith } 366*4c1414c8SBarry Smith 367*4c1414c8SBarry Smith #undef __FUNCT__ 368*4c1414c8SBarry Smith #define __FUNCT__ "MatRestoreColumnIJ_Inode" 369*4c1414c8SBarry Smith static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 370*4c1414c8SBarry Smith { 371*4c1414c8SBarry Smith PetscErrorCode ierr; 372*4c1414c8SBarry Smith 373*4c1414c8SBarry Smith PetscFunctionBegin; 374*4c1414c8SBarry Smith if (!ia) PetscFunctionReturn(0); 375*4c1414c8SBarry Smith ierr = PetscFree(*ia);CHKERRQ(ierr); 376*4c1414c8SBarry Smith ierr = PetscFree(*ja);CHKERRQ(ierr); 377*4c1414c8SBarry Smith PetscFunctionReturn(0); 378*4c1414c8SBarry Smith } 379*4c1414c8SBarry Smith 380*4c1414c8SBarry Smith /* ----------------------------------------------------------- */ 381*4c1414c8SBarry Smith 382*4c1414c8SBarry Smith #undef __FUNCT__ 383*4c1414c8SBarry Smith #define __FUNCT__ "MatMult_Inode" 384*4c1414c8SBarry Smith static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy) 385*4c1414c8SBarry Smith { 386*4c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 387*4c1414c8SBarry Smith PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 388*4c1414c8SBarry Smith PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y; 389*4c1414c8SBarry Smith PetscErrorCode ierr; 390*4c1414c8SBarry Smith PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 391*4c1414c8SBarry Smith 392*4c1414c8SBarry Smith #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 393*4c1414c8SBarry Smith #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 394*4c1414c8SBarry Smith #endif 395*4c1414c8SBarry Smith 396*4c1414c8SBarry Smith PetscFunctionBegin; 397*4c1414c8SBarry Smith if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 398*4c1414c8SBarry Smith node_max = a->inode.node_count; 399*4c1414c8SBarry Smith ns = a->inode.size; /* Node Size array */ 400*4c1414c8SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 401*4c1414c8SBarry Smith ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 402*4c1414c8SBarry Smith idx = a->j; 403*4c1414c8SBarry Smith v1 = a->a; 404*4c1414c8SBarry Smith ii = a->i; 405*4c1414c8SBarry Smith 406*4c1414c8SBarry Smith for (i = 0,row = 0; i< node_max; ++i){ 407*4c1414c8SBarry Smith nsz = ns[i]; 408*4c1414c8SBarry Smith n = ii[1] - ii[0]; 409*4c1414c8SBarry Smith ii += nsz; 410*4c1414c8SBarry Smith sz = n; /* No of non zeros in this row */ 411*4c1414c8SBarry Smith /* Switch on the size of Node */ 412*4c1414c8SBarry Smith switch (nsz){ /* Each loop in 'case' is unrolled */ 413*4c1414c8SBarry Smith case 1 : 414*4c1414c8SBarry Smith sum1 = 0; 415*4c1414c8SBarry Smith 416*4c1414c8SBarry Smith for(n = 0; n< sz-1; n+=2) { 417*4c1414c8SBarry Smith i1 = idx[0]; /* The instructions are ordered to */ 418*4c1414c8SBarry Smith i2 = idx[1]; /* make the compiler's job easy */ 419*4c1414c8SBarry Smith idx += 2; 420*4c1414c8SBarry Smith tmp0 = x[i1]; 421*4c1414c8SBarry Smith tmp1 = x[i2]; 422*4c1414c8SBarry Smith sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 423*4c1414c8SBarry Smith } 424*4c1414c8SBarry Smith 425*4c1414c8SBarry Smith if (n == sz-1){ /* Take care of the last nonzero */ 426*4c1414c8SBarry Smith tmp0 = x[*idx++]; 427*4c1414c8SBarry Smith sum1 += *v1++ * tmp0; 428*4c1414c8SBarry Smith } 429*4c1414c8SBarry Smith y[row++]=sum1; 430*4c1414c8SBarry Smith break; 431*4c1414c8SBarry Smith case 2: 432*4c1414c8SBarry Smith sum1 = 0; 433*4c1414c8SBarry Smith sum2 = 0; 434*4c1414c8SBarry Smith v2 = v1 + n; 435*4c1414c8SBarry Smith 436*4c1414c8SBarry Smith for (n = 0; n< sz-1; n+=2) { 437*4c1414c8SBarry Smith i1 = idx[0]; 438*4c1414c8SBarry Smith i2 = idx[1]; 439*4c1414c8SBarry Smith idx += 2; 440*4c1414c8SBarry Smith tmp0 = x[i1]; 441*4c1414c8SBarry Smith tmp1 = x[i2]; 442*4c1414c8SBarry Smith sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 443*4c1414c8SBarry Smith sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 444*4c1414c8SBarry Smith } 445*4c1414c8SBarry Smith if (n == sz-1){ 446*4c1414c8SBarry Smith tmp0 = x[*idx++]; 447*4c1414c8SBarry Smith sum1 += *v1++ * tmp0; 448*4c1414c8SBarry Smith sum2 += *v2++ * tmp0; 449*4c1414c8SBarry Smith } 450*4c1414c8SBarry Smith y[row++]=sum1; 451*4c1414c8SBarry Smith y[row++]=sum2; 452*4c1414c8SBarry Smith v1 =v2; /* Since the next block to be processed starts there*/ 453*4c1414c8SBarry Smith idx +=sz; 454*4c1414c8SBarry Smith break; 455*4c1414c8SBarry Smith case 3: 456*4c1414c8SBarry Smith sum1 = 0; 457*4c1414c8SBarry Smith sum2 = 0; 458*4c1414c8SBarry Smith sum3 = 0; 459*4c1414c8SBarry Smith v2 = v1 + n; 460*4c1414c8SBarry Smith v3 = v2 + n; 461*4c1414c8SBarry Smith 462*4c1414c8SBarry Smith for (n = 0; n< sz-1; n+=2) { 463*4c1414c8SBarry Smith i1 = idx[0]; 464*4c1414c8SBarry Smith i2 = idx[1]; 465*4c1414c8SBarry Smith idx += 2; 466*4c1414c8SBarry Smith tmp0 = x[i1]; 467*4c1414c8SBarry Smith tmp1 = x[i2]; 468*4c1414c8SBarry Smith sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 469*4c1414c8SBarry Smith sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 470*4c1414c8SBarry Smith sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 471*4c1414c8SBarry Smith } 472*4c1414c8SBarry Smith if (n == sz-1){ 473*4c1414c8SBarry Smith tmp0 = x[*idx++]; 474*4c1414c8SBarry Smith sum1 += *v1++ * tmp0; 475*4c1414c8SBarry Smith sum2 += *v2++ * tmp0; 476*4c1414c8SBarry Smith sum3 += *v3++ * tmp0; 477*4c1414c8SBarry Smith } 478*4c1414c8SBarry Smith y[row++]=sum1; 479*4c1414c8SBarry Smith y[row++]=sum2; 480*4c1414c8SBarry Smith y[row++]=sum3; 481*4c1414c8SBarry Smith v1 =v3; /* Since the next block to be processed starts there*/ 482*4c1414c8SBarry Smith idx +=2*sz; 483*4c1414c8SBarry Smith break; 484*4c1414c8SBarry Smith case 4: 485*4c1414c8SBarry Smith sum1 = 0; 486*4c1414c8SBarry Smith sum2 = 0; 487*4c1414c8SBarry Smith sum3 = 0; 488*4c1414c8SBarry Smith sum4 = 0; 489*4c1414c8SBarry Smith v2 = v1 + n; 490*4c1414c8SBarry Smith v3 = v2 + n; 491*4c1414c8SBarry Smith v4 = v3 + n; 492*4c1414c8SBarry Smith 493*4c1414c8SBarry Smith for (n = 0; n< sz-1; n+=2) { 494*4c1414c8SBarry Smith i1 = idx[0]; 495*4c1414c8SBarry Smith i2 = idx[1]; 496*4c1414c8SBarry Smith idx += 2; 497*4c1414c8SBarry Smith tmp0 = x[i1]; 498*4c1414c8SBarry Smith tmp1 = x[i2]; 499*4c1414c8SBarry Smith sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 500*4c1414c8SBarry Smith sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 501*4c1414c8SBarry Smith sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 502*4c1414c8SBarry Smith sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 503*4c1414c8SBarry Smith } 504*4c1414c8SBarry Smith if (n == sz-1){ 505*4c1414c8SBarry Smith tmp0 = x[*idx++]; 506*4c1414c8SBarry Smith sum1 += *v1++ * tmp0; 507*4c1414c8SBarry Smith sum2 += *v2++ * tmp0; 508*4c1414c8SBarry Smith sum3 += *v3++ * tmp0; 509*4c1414c8SBarry Smith sum4 += *v4++ * tmp0; 510*4c1414c8SBarry Smith } 511*4c1414c8SBarry Smith y[row++]=sum1; 512*4c1414c8SBarry Smith y[row++]=sum2; 513*4c1414c8SBarry Smith y[row++]=sum3; 514*4c1414c8SBarry Smith y[row++]=sum4; 515*4c1414c8SBarry Smith v1 =v4; /* Since the next block to be processed starts there*/ 516*4c1414c8SBarry Smith idx +=3*sz; 517*4c1414c8SBarry Smith break; 518*4c1414c8SBarry Smith case 5: 519*4c1414c8SBarry Smith sum1 = 0; 520*4c1414c8SBarry Smith sum2 = 0; 521*4c1414c8SBarry Smith sum3 = 0; 522*4c1414c8SBarry Smith sum4 = 0; 523*4c1414c8SBarry Smith sum5 = 0; 524*4c1414c8SBarry Smith v2 = v1 + n; 525*4c1414c8SBarry Smith v3 = v2 + n; 526*4c1414c8SBarry Smith v4 = v3 + n; 527*4c1414c8SBarry Smith v5 = v4 + n; 528*4c1414c8SBarry Smith 529*4c1414c8SBarry Smith for (n = 0; n<sz-1; n+=2) { 530*4c1414c8SBarry Smith i1 = idx[0]; 531*4c1414c8SBarry Smith i2 = idx[1]; 532*4c1414c8SBarry Smith idx += 2; 533*4c1414c8SBarry Smith tmp0 = x[i1]; 534*4c1414c8SBarry Smith tmp1 = x[i2]; 535*4c1414c8SBarry Smith sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 536*4c1414c8SBarry Smith sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 537*4c1414c8SBarry Smith sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 538*4c1414c8SBarry Smith sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 539*4c1414c8SBarry Smith sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 540*4c1414c8SBarry Smith } 541*4c1414c8SBarry Smith if (n == sz-1){ 542*4c1414c8SBarry Smith tmp0 = x[*idx++]; 543*4c1414c8SBarry Smith sum1 += *v1++ * tmp0; 544*4c1414c8SBarry Smith sum2 += *v2++ * tmp0; 545*4c1414c8SBarry Smith sum3 += *v3++ * tmp0; 546*4c1414c8SBarry Smith sum4 += *v4++ * tmp0; 547*4c1414c8SBarry Smith sum5 += *v5++ * tmp0; 548*4c1414c8SBarry Smith } 549*4c1414c8SBarry Smith y[row++]=sum1; 550*4c1414c8SBarry Smith y[row++]=sum2; 551*4c1414c8SBarry Smith y[row++]=sum3; 552*4c1414c8SBarry Smith y[row++]=sum4; 553*4c1414c8SBarry Smith y[row++]=sum5; 554*4c1414c8SBarry Smith v1 =v5; /* Since the next block to be processed starts there */ 555*4c1414c8SBarry Smith idx +=4*sz; 556*4c1414c8SBarry Smith break; 557*4c1414c8SBarry Smith default : 558*4c1414c8SBarry Smith SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 559*4c1414c8SBarry Smith } 560*4c1414c8SBarry Smith } 561*4c1414c8SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 562*4c1414c8SBarry Smith ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 563*4c1414c8SBarry Smith ierr = PetscLogFlops(2*a->nz - A->rmap.n);CHKERRQ(ierr); 564*4c1414c8SBarry Smith PetscFunctionReturn(0); 565*4c1414c8SBarry Smith } 566*4c1414c8SBarry Smith /* ----------------------------------------------------------- */ 567*4c1414c8SBarry Smith /* Almost same code as the MatMult_Inode() */ 568*4c1414c8SBarry Smith #undef __FUNCT__ 569*4c1414c8SBarry Smith #define __FUNCT__ "MatMultAdd_Inode" 570*4c1414c8SBarry Smith static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy) 571*4c1414c8SBarry Smith { 572*4c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 573*4c1414c8SBarry Smith PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 574*4c1414c8SBarry Smith PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt; 575*4c1414c8SBarry Smith PetscErrorCode ierr; 576*4c1414c8SBarry Smith PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 577*4c1414c8SBarry Smith 578*4c1414c8SBarry Smith PetscFunctionBegin; 579*4c1414c8SBarry Smith if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 580*4c1414c8SBarry Smith node_max = a->inode.node_count; 581*4c1414c8SBarry Smith ns = a->inode.size; /* Node Size array */ 582*4c1414c8SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 583*4c1414c8SBarry Smith ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 584*4c1414c8SBarry Smith if (zz != yy) { 585*4c1414c8SBarry Smith ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 586*4c1414c8SBarry Smith } else { 587*4c1414c8SBarry Smith z = y; 588*4c1414c8SBarry Smith } 589*4c1414c8SBarry Smith zt = z; 590*4c1414c8SBarry Smith 591*4c1414c8SBarry Smith idx = a->j; 592*4c1414c8SBarry Smith v1 = a->a; 593*4c1414c8SBarry Smith ii = a->i; 594*4c1414c8SBarry Smith 595*4c1414c8SBarry Smith for (i = 0,row = 0; i< node_max; ++i){ 596*4c1414c8SBarry Smith nsz = ns[i]; 597*4c1414c8SBarry Smith n = ii[1] - ii[0]; 598*4c1414c8SBarry Smith ii += nsz; 599*4c1414c8SBarry Smith sz = n; /* No of non zeros in this row */ 600*4c1414c8SBarry Smith /* Switch on the size of Node */ 601*4c1414c8SBarry Smith switch (nsz){ /* Each loop in 'case' is unrolled */ 602*4c1414c8SBarry Smith case 1 : 603*4c1414c8SBarry Smith sum1 = *zt++; 604*4c1414c8SBarry Smith 605*4c1414c8SBarry Smith for(n = 0; n< sz-1; n+=2) { 606*4c1414c8SBarry Smith i1 = idx[0]; /* The instructions are ordered to */ 607*4c1414c8SBarry Smith i2 = idx[1]; /* make the compiler's job easy */ 608*4c1414c8SBarry Smith idx += 2; 609*4c1414c8SBarry Smith tmp0 = x[i1]; 610*4c1414c8SBarry Smith tmp1 = x[i2]; 611*4c1414c8SBarry Smith sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 612*4c1414c8SBarry Smith } 613*4c1414c8SBarry Smith 614*4c1414c8SBarry Smith if(n == sz-1){ /* Take care of the last nonzero */ 615*4c1414c8SBarry Smith tmp0 = x[*idx++]; 616*4c1414c8SBarry Smith sum1 += *v1++ * tmp0; 617*4c1414c8SBarry Smith } 618*4c1414c8SBarry Smith y[row++]=sum1; 619*4c1414c8SBarry Smith break; 620*4c1414c8SBarry Smith case 2: 621*4c1414c8SBarry Smith sum1 = *zt++; 622*4c1414c8SBarry Smith sum2 = *zt++; 623*4c1414c8SBarry Smith v2 = v1 + n; 624*4c1414c8SBarry Smith 625*4c1414c8SBarry Smith for(n = 0; n< sz-1; n+=2) { 626*4c1414c8SBarry Smith i1 = idx[0]; 627*4c1414c8SBarry Smith i2 = idx[1]; 628*4c1414c8SBarry Smith idx += 2; 629*4c1414c8SBarry Smith tmp0 = x[i1]; 630*4c1414c8SBarry Smith tmp1 = x[i2]; 631*4c1414c8SBarry Smith sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 632*4c1414c8SBarry Smith sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 633*4c1414c8SBarry Smith } 634*4c1414c8SBarry Smith if(n == sz-1){ 635*4c1414c8SBarry Smith tmp0 = x[*idx++]; 636*4c1414c8SBarry Smith sum1 += *v1++ * tmp0; 637*4c1414c8SBarry Smith sum2 += *v2++ * tmp0; 638*4c1414c8SBarry Smith } 639*4c1414c8SBarry Smith y[row++]=sum1; 640*4c1414c8SBarry Smith y[row++]=sum2; 641*4c1414c8SBarry Smith v1 =v2; /* Since the next block to be processed starts there*/ 642*4c1414c8SBarry Smith idx +=sz; 643*4c1414c8SBarry Smith break; 644*4c1414c8SBarry Smith case 3: 645*4c1414c8SBarry Smith sum1 = *zt++; 646*4c1414c8SBarry Smith sum2 = *zt++; 647*4c1414c8SBarry Smith sum3 = *zt++; 648*4c1414c8SBarry Smith v2 = v1 + n; 649*4c1414c8SBarry Smith v3 = v2 + n; 650*4c1414c8SBarry Smith 651*4c1414c8SBarry Smith for (n = 0; n< sz-1; n+=2) { 652*4c1414c8SBarry Smith i1 = idx[0]; 653*4c1414c8SBarry Smith i2 = idx[1]; 654*4c1414c8SBarry Smith idx += 2; 655*4c1414c8SBarry Smith tmp0 = x[i1]; 656*4c1414c8SBarry Smith tmp1 = x[i2]; 657*4c1414c8SBarry Smith sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 658*4c1414c8SBarry Smith sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 659*4c1414c8SBarry Smith sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 660*4c1414c8SBarry Smith } 661*4c1414c8SBarry Smith if (n == sz-1){ 662*4c1414c8SBarry Smith tmp0 = x[*idx++]; 663*4c1414c8SBarry Smith sum1 += *v1++ * tmp0; 664*4c1414c8SBarry Smith sum2 += *v2++ * tmp0; 665*4c1414c8SBarry Smith sum3 += *v3++ * tmp0; 666*4c1414c8SBarry Smith } 667*4c1414c8SBarry Smith y[row++]=sum1; 668*4c1414c8SBarry Smith y[row++]=sum2; 669*4c1414c8SBarry Smith y[row++]=sum3; 670*4c1414c8SBarry Smith v1 =v3; /* Since the next block to be processed starts there*/ 671*4c1414c8SBarry Smith idx +=2*sz; 672*4c1414c8SBarry Smith break; 673*4c1414c8SBarry Smith case 4: 674*4c1414c8SBarry Smith sum1 = *zt++; 675*4c1414c8SBarry Smith sum2 = *zt++; 676*4c1414c8SBarry Smith sum3 = *zt++; 677*4c1414c8SBarry Smith sum4 = *zt++; 678*4c1414c8SBarry Smith v2 = v1 + n; 679*4c1414c8SBarry Smith v3 = v2 + n; 680*4c1414c8SBarry Smith v4 = v3 + n; 681*4c1414c8SBarry Smith 682*4c1414c8SBarry Smith for (n = 0; n< sz-1; n+=2) { 683*4c1414c8SBarry Smith i1 = idx[0]; 684*4c1414c8SBarry Smith i2 = idx[1]; 685*4c1414c8SBarry Smith idx += 2; 686*4c1414c8SBarry Smith tmp0 = x[i1]; 687*4c1414c8SBarry Smith tmp1 = x[i2]; 688*4c1414c8SBarry Smith sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 689*4c1414c8SBarry Smith sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 690*4c1414c8SBarry Smith sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 691*4c1414c8SBarry Smith sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 692*4c1414c8SBarry Smith } 693*4c1414c8SBarry Smith if (n == sz-1){ 694*4c1414c8SBarry Smith tmp0 = x[*idx++]; 695*4c1414c8SBarry Smith sum1 += *v1++ * tmp0; 696*4c1414c8SBarry Smith sum2 += *v2++ * tmp0; 697*4c1414c8SBarry Smith sum3 += *v3++ * tmp0; 698*4c1414c8SBarry Smith sum4 += *v4++ * tmp0; 699*4c1414c8SBarry Smith } 700*4c1414c8SBarry Smith y[row++]=sum1; 701*4c1414c8SBarry Smith y[row++]=sum2; 702*4c1414c8SBarry Smith y[row++]=sum3; 703*4c1414c8SBarry Smith y[row++]=sum4; 704*4c1414c8SBarry Smith v1 =v4; /* Since the next block to be processed starts there*/ 705*4c1414c8SBarry Smith idx +=3*sz; 706*4c1414c8SBarry Smith break; 707*4c1414c8SBarry Smith case 5: 708*4c1414c8SBarry Smith sum1 = *zt++; 709*4c1414c8SBarry Smith sum2 = *zt++; 710*4c1414c8SBarry Smith sum3 = *zt++; 711*4c1414c8SBarry Smith sum4 = *zt++; 712*4c1414c8SBarry Smith sum5 = *zt++; 713*4c1414c8SBarry Smith v2 = v1 + n; 714*4c1414c8SBarry Smith v3 = v2 + n; 715*4c1414c8SBarry Smith v4 = v3 + n; 716*4c1414c8SBarry Smith v5 = v4 + n; 717*4c1414c8SBarry Smith 718*4c1414c8SBarry Smith for (n = 0; n<sz-1; n+=2) { 719*4c1414c8SBarry Smith i1 = idx[0]; 720*4c1414c8SBarry Smith i2 = idx[1]; 721*4c1414c8SBarry Smith idx += 2; 722*4c1414c8SBarry Smith tmp0 = x[i1]; 723*4c1414c8SBarry Smith tmp1 = x[i2]; 724*4c1414c8SBarry Smith sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 725*4c1414c8SBarry Smith sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 726*4c1414c8SBarry Smith sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 727*4c1414c8SBarry Smith sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 728*4c1414c8SBarry Smith sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 729*4c1414c8SBarry Smith } 730*4c1414c8SBarry Smith if(n == sz-1){ 731*4c1414c8SBarry Smith tmp0 = x[*idx++]; 732*4c1414c8SBarry Smith sum1 += *v1++ * tmp0; 733*4c1414c8SBarry Smith sum2 += *v2++ * tmp0; 734*4c1414c8SBarry Smith sum3 += *v3++ * tmp0; 735*4c1414c8SBarry Smith sum4 += *v4++ * tmp0; 736*4c1414c8SBarry Smith sum5 += *v5++ * tmp0; 737*4c1414c8SBarry Smith } 738*4c1414c8SBarry Smith y[row++]=sum1; 739*4c1414c8SBarry Smith y[row++]=sum2; 740*4c1414c8SBarry Smith y[row++]=sum3; 741*4c1414c8SBarry Smith y[row++]=sum4; 742*4c1414c8SBarry Smith y[row++]=sum5; 743*4c1414c8SBarry Smith v1 =v5; /* Since the next block to be processed starts there */ 744*4c1414c8SBarry Smith idx +=4*sz; 745*4c1414c8SBarry Smith break; 746*4c1414c8SBarry Smith default : 747*4c1414c8SBarry Smith SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 748*4c1414c8SBarry Smith } 749*4c1414c8SBarry Smith } 750*4c1414c8SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 751*4c1414c8SBarry Smith ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 752*4c1414c8SBarry Smith if (zz != yy) { 753*4c1414c8SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 754*4c1414c8SBarry Smith } 755*4c1414c8SBarry Smith ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 756*4c1414c8SBarry Smith PetscFunctionReturn(0); 757*4c1414c8SBarry Smith } 758*4c1414c8SBarry Smith 759*4c1414c8SBarry Smith /* ----------------------------------------------------------- */ 760*4c1414c8SBarry Smith #undef __FUNCT__ 761*4c1414c8SBarry Smith #define __FUNCT__ "MatSolve_Inode" 762*4c1414c8SBarry Smith PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx) 763*4c1414c8SBarry Smith { 764*4c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 765*4c1414c8SBarry Smith IS iscol = a->col,isrow = a->row; 766*4c1414c8SBarry Smith PetscErrorCode ierr; 767*4c1414c8SBarry Smith PetscInt *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j; 768*4c1414c8SBarry Smith PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout; 769*4c1414c8SBarry Smith PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1; 770*4c1414c8SBarry Smith PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5; 771*4c1414c8SBarry Smith 772*4c1414c8SBarry Smith PetscFunctionBegin; 773*4c1414c8SBarry Smith if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix"); 774*4c1414c8SBarry Smith if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 775*4c1414c8SBarry Smith node_max = a->inode.node_count; 776*4c1414c8SBarry Smith ns = a->inode.size; /* Node Size array */ 777*4c1414c8SBarry Smith 778*4c1414c8SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 779*4c1414c8SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 780*4c1414c8SBarry Smith tmp = a->solve_work; 781*4c1414c8SBarry Smith 782*4c1414c8SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 783*4c1414c8SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 784*4c1414c8SBarry Smith 785*4c1414c8SBarry Smith /* forward solve the lower triangular */ 786*4c1414c8SBarry Smith tmps = tmp ; 787*4c1414c8SBarry Smith aa = a_a ; 788*4c1414c8SBarry Smith aj = a_j ; 789*4c1414c8SBarry Smith ad = a->diag; 790*4c1414c8SBarry Smith 791*4c1414c8SBarry Smith for (i = 0,row = 0; i< node_max; ++i){ 792*4c1414c8SBarry Smith nsz = ns[i]; 793*4c1414c8SBarry Smith aii = ai[row]; 794*4c1414c8SBarry Smith v1 = aa + aii; 795*4c1414c8SBarry Smith vi = aj + aii; 796*4c1414c8SBarry Smith nz = ad[row]- aii; 797*4c1414c8SBarry Smith 798*4c1414c8SBarry Smith switch (nsz){ /* Each loop in 'case' is unrolled */ 799*4c1414c8SBarry Smith case 1 : 800*4c1414c8SBarry Smith sum1 = b[*r++]; 801*4c1414c8SBarry Smith /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/ 802*4c1414c8SBarry Smith for(j=0; j<nz-1; j+=2){ 803*4c1414c8SBarry Smith i0 = vi[0]; 804*4c1414c8SBarry Smith i1 = vi[1]; 805*4c1414c8SBarry Smith vi +=2; 806*4c1414c8SBarry Smith tmp0 = tmps[i0]; 807*4c1414c8SBarry Smith tmp1 = tmps[i1]; 808*4c1414c8SBarry Smith sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 809*4c1414c8SBarry Smith } 810*4c1414c8SBarry Smith if(j == nz-1){ 811*4c1414c8SBarry Smith tmp0 = tmps[*vi++]; 812*4c1414c8SBarry Smith sum1 -= *v1++ *tmp0; 813*4c1414c8SBarry Smith } 814*4c1414c8SBarry Smith tmp[row ++]=sum1; 815*4c1414c8SBarry Smith break; 816*4c1414c8SBarry Smith case 2: 817*4c1414c8SBarry Smith sum1 = b[*r++]; 818*4c1414c8SBarry Smith sum2 = b[*r++]; 819*4c1414c8SBarry Smith v2 = aa + ai[row+1]; 820*4c1414c8SBarry Smith 821*4c1414c8SBarry Smith for(j=0; j<nz-1; j+=2){ 822*4c1414c8SBarry Smith i0 = vi[0]; 823*4c1414c8SBarry Smith i1 = vi[1]; 824*4c1414c8SBarry Smith vi +=2; 825*4c1414c8SBarry Smith tmp0 = tmps[i0]; 826*4c1414c8SBarry Smith tmp1 = tmps[i1]; 827*4c1414c8SBarry Smith sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 828*4c1414c8SBarry Smith sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 829*4c1414c8SBarry Smith } 830*4c1414c8SBarry Smith if(j == nz-1){ 831*4c1414c8SBarry Smith tmp0 = tmps[*vi++]; 832*4c1414c8SBarry Smith sum1 -= *v1++ *tmp0; 833*4c1414c8SBarry Smith sum2 -= *v2++ *tmp0; 834*4c1414c8SBarry Smith } 835*4c1414c8SBarry Smith sum2 -= *v2++ * sum1; 836*4c1414c8SBarry Smith tmp[row ++]=sum1; 837*4c1414c8SBarry Smith tmp[row ++]=sum2; 838*4c1414c8SBarry Smith break; 839*4c1414c8SBarry Smith case 3: 840*4c1414c8SBarry Smith sum1 = b[*r++]; 841*4c1414c8SBarry Smith sum2 = b[*r++]; 842*4c1414c8SBarry Smith sum3 = b[*r++]; 843*4c1414c8SBarry Smith v2 = aa + ai[row+1]; 844*4c1414c8SBarry Smith v3 = aa + ai[row+2]; 845*4c1414c8SBarry Smith 846*4c1414c8SBarry Smith for (j=0; j<nz-1; j+=2){ 847*4c1414c8SBarry Smith i0 = vi[0]; 848*4c1414c8SBarry Smith i1 = vi[1]; 849*4c1414c8SBarry Smith vi +=2; 850*4c1414c8SBarry Smith tmp0 = tmps[i0]; 851*4c1414c8SBarry Smith tmp1 = tmps[i1]; 852*4c1414c8SBarry Smith sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 853*4c1414c8SBarry Smith sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 854*4c1414c8SBarry Smith sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 855*4c1414c8SBarry Smith } 856*4c1414c8SBarry Smith if (j == nz-1){ 857*4c1414c8SBarry Smith tmp0 = tmps[*vi++]; 858*4c1414c8SBarry Smith sum1 -= *v1++ *tmp0; 859*4c1414c8SBarry Smith sum2 -= *v2++ *tmp0; 860*4c1414c8SBarry Smith sum3 -= *v3++ *tmp0; 861*4c1414c8SBarry Smith } 862*4c1414c8SBarry Smith sum2 -= *v2++ * sum1; 863*4c1414c8SBarry Smith sum3 -= *v3++ * sum1; 864*4c1414c8SBarry Smith sum3 -= *v3++ * sum2; 865*4c1414c8SBarry Smith tmp[row ++]=sum1; 866*4c1414c8SBarry Smith tmp[row ++]=sum2; 867*4c1414c8SBarry Smith tmp[row ++]=sum3; 868*4c1414c8SBarry Smith break; 869*4c1414c8SBarry Smith 870*4c1414c8SBarry Smith case 4: 871*4c1414c8SBarry Smith sum1 = b[*r++]; 872*4c1414c8SBarry Smith sum2 = b[*r++]; 873*4c1414c8SBarry Smith sum3 = b[*r++]; 874*4c1414c8SBarry Smith sum4 = b[*r++]; 875*4c1414c8SBarry Smith v2 = aa + ai[row+1]; 876*4c1414c8SBarry Smith v3 = aa + ai[row+2]; 877*4c1414c8SBarry Smith v4 = aa + ai[row+3]; 878*4c1414c8SBarry Smith 879*4c1414c8SBarry Smith for (j=0; j<nz-1; j+=2){ 880*4c1414c8SBarry Smith i0 = vi[0]; 881*4c1414c8SBarry Smith i1 = vi[1]; 882*4c1414c8SBarry Smith vi +=2; 883*4c1414c8SBarry Smith tmp0 = tmps[i0]; 884*4c1414c8SBarry Smith tmp1 = tmps[i1]; 885*4c1414c8SBarry Smith sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 886*4c1414c8SBarry Smith sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 887*4c1414c8SBarry Smith sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 888*4c1414c8SBarry Smith sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 889*4c1414c8SBarry Smith } 890*4c1414c8SBarry Smith if (j == nz-1){ 891*4c1414c8SBarry Smith tmp0 = tmps[*vi++]; 892*4c1414c8SBarry Smith sum1 -= *v1++ *tmp0; 893*4c1414c8SBarry Smith sum2 -= *v2++ *tmp0; 894*4c1414c8SBarry Smith sum3 -= *v3++ *tmp0; 895*4c1414c8SBarry Smith sum4 -= *v4++ *tmp0; 896*4c1414c8SBarry Smith } 897*4c1414c8SBarry Smith sum2 -= *v2++ * sum1; 898*4c1414c8SBarry Smith sum3 -= *v3++ * sum1; 899*4c1414c8SBarry Smith sum4 -= *v4++ * sum1; 900*4c1414c8SBarry Smith sum3 -= *v3++ * sum2; 901*4c1414c8SBarry Smith sum4 -= *v4++ * sum2; 902*4c1414c8SBarry Smith sum4 -= *v4++ * sum3; 903*4c1414c8SBarry Smith 904*4c1414c8SBarry Smith tmp[row ++]=sum1; 905*4c1414c8SBarry Smith tmp[row ++]=sum2; 906*4c1414c8SBarry Smith tmp[row ++]=sum3; 907*4c1414c8SBarry Smith tmp[row ++]=sum4; 908*4c1414c8SBarry Smith break; 909*4c1414c8SBarry Smith case 5: 910*4c1414c8SBarry Smith sum1 = b[*r++]; 911*4c1414c8SBarry Smith sum2 = b[*r++]; 912*4c1414c8SBarry Smith sum3 = b[*r++]; 913*4c1414c8SBarry Smith sum4 = b[*r++]; 914*4c1414c8SBarry Smith sum5 = b[*r++]; 915*4c1414c8SBarry Smith v2 = aa + ai[row+1]; 916*4c1414c8SBarry Smith v3 = aa + ai[row+2]; 917*4c1414c8SBarry Smith v4 = aa + ai[row+3]; 918*4c1414c8SBarry Smith v5 = aa + ai[row+4]; 919*4c1414c8SBarry Smith 920*4c1414c8SBarry Smith for (j=0; j<nz-1; j+=2){ 921*4c1414c8SBarry Smith i0 = vi[0]; 922*4c1414c8SBarry Smith i1 = vi[1]; 923*4c1414c8SBarry Smith vi +=2; 924*4c1414c8SBarry Smith tmp0 = tmps[i0]; 925*4c1414c8SBarry Smith tmp1 = tmps[i1]; 926*4c1414c8SBarry Smith sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 927*4c1414c8SBarry Smith sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 928*4c1414c8SBarry Smith sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 929*4c1414c8SBarry Smith sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 930*4c1414c8SBarry Smith sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 931*4c1414c8SBarry Smith } 932*4c1414c8SBarry Smith if (j == nz-1){ 933*4c1414c8SBarry Smith tmp0 = tmps[*vi++]; 934*4c1414c8SBarry Smith sum1 -= *v1++ *tmp0; 935*4c1414c8SBarry Smith sum2 -= *v2++ *tmp0; 936*4c1414c8SBarry Smith sum3 -= *v3++ *tmp0; 937*4c1414c8SBarry Smith sum4 -= *v4++ *tmp0; 938*4c1414c8SBarry Smith sum5 -= *v5++ *tmp0; 939*4c1414c8SBarry Smith } 940*4c1414c8SBarry Smith 941*4c1414c8SBarry Smith sum2 -= *v2++ * sum1; 942*4c1414c8SBarry Smith sum3 -= *v3++ * sum1; 943*4c1414c8SBarry Smith sum4 -= *v4++ * sum1; 944*4c1414c8SBarry Smith sum5 -= *v5++ * sum1; 945*4c1414c8SBarry Smith sum3 -= *v3++ * sum2; 946*4c1414c8SBarry Smith sum4 -= *v4++ * sum2; 947*4c1414c8SBarry Smith sum5 -= *v5++ * sum2; 948*4c1414c8SBarry Smith sum4 -= *v4++ * sum3; 949*4c1414c8SBarry Smith sum5 -= *v5++ * sum3; 950*4c1414c8SBarry Smith sum5 -= *v5++ * sum4; 951*4c1414c8SBarry Smith 952*4c1414c8SBarry Smith tmp[row ++]=sum1; 953*4c1414c8SBarry Smith tmp[row ++]=sum2; 954*4c1414c8SBarry Smith tmp[row ++]=sum3; 955*4c1414c8SBarry Smith tmp[row ++]=sum4; 956*4c1414c8SBarry Smith tmp[row ++]=sum5; 957*4c1414c8SBarry Smith break; 958*4c1414c8SBarry Smith default: 959*4c1414c8SBarry Smith SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 960*4c1414c8SBarry Smith } 961*4c1414c8SBarry Smith } 962*4c1414c8SBarry Smith /* backward solve the upper triangular */ 963*4c1414c8SBarry Smith for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 964*4c1414c8SBarry Smith nsz = ns[i]; 965*4c1414c8SBarry Smith aii = ai[row+1] -1; 966*4c1414c8SBarry Smith v1 = aa + aii; 967*4c1414c8SBarry Smith vi = aj + aii; 968*4c1414c8SBarry Smith nz = aii- ad[row]; 969*4c1414c8SBarry Smith switch (nsz){ /* Each loop in 'case' is unrolled */ 970*4c1414c8SBarry Smith case 1 : 971*4c1414c8SBarry Smith sum1 = tmp[row]; 972*4c1414c8SBarry Smith 973*4c1414c8SBarry Smith for(j=nz ; j>1; j-=2){ 974*4c1414c8SBarry Smith vi -=2; 975*4c1414c8SBarry Smith i0 = vi[2]; 976*4c1414c8SBarry Smith i1 = vi[1]; 977*4c1414c8SBarry Smith tmp0 = tmps[i0]; 978*4c1414c8SBarry Smith tmp1 = tmps[i1]; 979*4c1414c8SBarry Smith v1 -= 2; 980*4c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 981*4c1414c8SBarry Smith } 982*4c1414c8SBarry Smith if (j==1){ 983*4c1414c8SBarry Smith tmp0 = tmps[*vi--]; 984*4c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 985*4c1414c8SBarry Smith } 986*4c1414c8SBarry Smith x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 987*4c1414c8SBarry Smith break; 988*4c1414c8SBarry Smith case 2 : 989*4c1414c8SBarry Smith sum1 = tmp[row]; 990*4c1414c8SBarry Smith sum2 = tmp[row -1]; 991*4c1414c8SBarry Smith v2 = aa + ai[row]-1; 992*4c1414c8SBarry Smith for (j=nz ; j>1; j-=2){ 993*4c1414c8SBarry Smith vi -=2; 994*4c1414c8SBarry Smith i0 = vi[2]; 995*4c1414c8SBarry Smith i1 = vi[1]; 996*4c1414c8SBarry Smith tmp0 = tmps[i0]; 997*4c1414c8SBarry Smith tmp1 = tmps[i1]; 998*4c1414c8SBarry Smith v1 -= 2; 999*4c1414c8SBarry Smith v2 -= 2; 1000*4c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1001*4c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1002*4c1414c8SBarry Smith } 1003*4c1414c8SBarry Smith if (j==1){ 1004*4c1414c8SBarry Smith tmp0 = tmps[*vi--]; 1005*4c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 1006*4c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 1007*4c1414c8SBarry Smith } 1008*4c1414c8SBarry Smith 1009*4c1414c8SBarry Smith tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1010*4c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 1011*4c1414c8SBarry Smith x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1012*4c1414c8SBarry Smith break; 1013*4c1414c8SBarry Smith case 3 : 1014*4c1414c8SBarry Smith sum1 = tmp[row]; 1015*4c1414c8SBarry Smith sum2 = tmp[row -1]; 1016*4c1414c8SBarry Smith sum3 = tmp[row -2]; 1017*4c1414c8SBarry Smith v2 = aa + ai[row]-1; 1018*4c1414c8SBarry Smith v3 = aa + ai[row -1]-1; 1019*4c1414c8SBarry Smith for (j=nz ; j>1; j-=2){ 1020*4c1414c8SBarry Smith vi -=2; 1021*4c1414c8SBarry Smith i0 = vi[2]; 1022*4c1414c8SBarry Smith i1 = vi[1]; 1023*4c1414c8SBarry Smith tmp0 = tmps[i0]; 1024*4c1414c8SBarry Smith tmp1 = tmps[i1]; 1025*4c1414c8SBarry Smith v1 -= 2; 1026*4c1414c8SBarry Smith v2 -= 2; 1027*4c1414c8SBarry Smith v3 -= 2; 1028*4c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029*4c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030*4c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1031*4c1414c8SBarry Smith } 1032*4c1414c8SBarry Smith if (j==1){ 1033*4c1414c8SBarry Smith tmp0 = tmps[*vi--]; 1034*4c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 1035*4c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 1036*4c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 1037*4c1414c8SBarry Smith } 1038*4c1414c8SBarry Smith tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1039*4c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 1040*4c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 1041*4c1414c8SBarry Smith tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1042*4c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 1043*4c1414c8SBarry Smith x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1044*4c1414c8SBarry Smith 1045*4c1414c8SBarry Smith break; 1046*4c1414c8SBarry Smith case 4 : 1047*4c1414c8SBarry Smith sum1 = tmp[row]; 1048*4c1414c8SBarry Smith sum2 = tmp[row -1]; 1049*4c1414c8SBarry Smith sum3 = tmp[row -2]; 1050*4c1414c8SBarry Smith sum4 = tmp[row -3]; 1051*4c1414c8SBarry Smith v2 = aa + ai[row]-1; 1052*4c1414c8SBarry Smith v3 = aa + ai[row -1]-1; 1053*4c1414c8SBarry Smith v4 = aa + ai[row -2]-1; 1054*4c1414c8SBarry Smith 1055*4c1414c8SBarry Smith for (j=nz ; j>1; j-=2){ 1056*4c1414c8SBarry Smith vi -=2; 1057*4c1414c8SBarry Smith i0 = vi[2]; 1058*4c1414c8SBarry Smith i1 = vi[1]; 1059*4c1414c8SBarry Smith tmp0 = tmps[i0]; 1060*4c1414c8SBarry Smith tmp1 = tmps[i1]; 1061*4c1414c8SBarry Smith v1 -= 2; 1062*4c1414c8SBarry Smith v2 -= 2; 1063*4c1414c8SBarry Smith v3 -= 2; 1064*4c1414c8SBarry Smith v4 -= 2; 1065*4c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1066*4c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1067*4c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1068*4c1414c8SBarry Smith sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1069*4c1414c8SBarry Smith } 1070*4c1414c8SBarry Smith if (j==1){ 1071*4c1414c8SBarry Smith tmp0 = tmps[*vi--]; 1072*4c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 1073*4c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 1074*4c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 1075*4c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 1076*4c1414c8SBarry Smith } 1077*4c1414c8SBarry Smith 1078*4c1414c8SBarry Smith tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1079*4c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 1080*4c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 1081*4c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 1082*4c1414c8SBarry Smith tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1083*4c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 1084*4c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 1085*4c1414c8SBarry Smith tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1086*4c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 1087*4c1414c8SBarry Smith x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1088*4c1414c8SBarry Smith break; 1089*4c1414c8SBarry Smith case 5 : 1090*4c1414c8SBarry Smith sum1 = tmp[row]; 1091*4c1414c8SBarry Smith sum2 = tmp[row -1]; 1092*4c1414c8SBarry Smith sum3 = tmp[row -2]; 1093*4c1414c8SBarry Smith sum4 = tmp[row -3]; 1094*4c1414c8SBarry Smith sum5 = tmp[row -4]; 1095*4c1414c8SBarry Smith v2 = aa + ai[row]-1; 1096*4c1414c8SBarry Smith v3 = aa + ai[row -1]-1; 1097*4c1414c8SBarry Smith v4 = aa + ai[row -2]-1; 1098*4c1414c8SBarry Smith v5 = aa + ai[row -3]-1; 1099*4c1414c8SBarry Smith for (j=nz ; j>1; j-=2){ 1100*4c1414c8SBarry Smith vi -= 2; 1101*4c1414c8SBarry Smith i0 = vi[2]; 1102*4c1414c8SBarry Smith i1 = vi[1]; 1103*4c1414c8SBarry Smith tmp0 = tmps[i0]; 1104*4c1414c8SBarry Smith tmp1 = tmps[i1]; 1105*4c1414c8SBarry Smith v1 -= 2; 1106*4c1414c8SBarry Smith v2 -= 2; 1107*4c1414c8SBarry Smith v3 -= 2; 1108*4c1414c8SBarry Smith v4 -= 2; 1109*4c1414c8SBarry Smith v5 -= 2; 1110*4c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1111*4c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1112*4c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1113*4c1414c8SBarry Smith sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1114*4c1414c8SBarry Smith sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1115*4c1414c8SBarry Smith } 1116*4c1414c8SBarry Smith if (j==1){ 1117*4c1414c8SBarry Smith tmp0 = tmps[*vi--]; 1118*4c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 1119*4c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 1120*4c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 1121*4c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 1122*4c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 1123*4c1414c8SBarry Smith } 1124*4c1414c8SBarry Smith 1125*4c1414c8SBarry Smith tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1126*4c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 1127*4c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 1128*4c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 1129*4c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 1130*4c1414c8SBarry Smith tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1131*4c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 1132*4c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 1133*4c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 1134*4c1414c8SBarry Smith tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1135*4c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 1136*4c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 1137*4c1414c8SBarry Smith tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1138*4c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 1139*4c1414c8SBarry Smith x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1140*4c1414c8SBarry Smith break; 1141*4c1414c8SBarry Smith default: 1142*4c1414c8SBarry Smith SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1143*4c1414c8SBarry Smith } 1144*4c1414c8SBarry Smith } 1145*4c1414c8SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1146*4c1414c8SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1147*4c1414c8SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1148*4c1414c8SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1149*4c1414c8SBarry Smith ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr); 1150*4c1414c8SBarry Smith PetscFunctionReturn(0); 1151*4c1414c8SBarry Smith } 1152*4c1414c8SBarry Smith 1153*4c1414c8SBarry Smith #undef __FUNCT__ 1154*4c1414c8SBarry Smith #define __FUNCT__ "MatLUFactorNumeric_Inode" 1155*4c1414c8SBarry Smith PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B) 1156*4c1414c8SBarry Smith { 1157*4c1414c8SBarry Smith Mat C = *B; 1158*4c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1159*4c1414c8SBarry Smith IS iscol = b->col,isrow = b->row,isicol = b->icol; 1160*4c1414c8SBarry Smith PetscErrorCode ierr; 1161*4c1414c8SBarry Smith PetscInt *r,*ic,*c,n = A->rmap.n,*bi = b->i; 1162*4c1414c8SBarry Smith PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1163*4c1414c8SBarry Smith PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1164*4c1414c8SBarry Smith PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1165*4c1414c8SBarry Smith PetscScalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3; 1166*4c1414c8SBarry Smith PetscScalar tmp,*ba = b->a,*aa = a->a,*pv; 1167*4c1414c8SBarry Smith PetscReal rs=0.0; 1168*4c1414c8SBarry Smith LUShift_Ctx sctx; 1169*4c1414c8SBarry Smith PetscInt newshift; 1170*4c1414c8SBarry Smith 1171*4c1414c8SBarry Smith PetscFunctionBegin; 1172*4c1414c8SBarry Smith sctx.shift_top = 0; 1173*4c1414c8SBarry Smith sctx.nshift_max = 0; 1174*4c1414c8SBarry Smith sctx.shift_lo = 0; 1175*4c1414c8SBarry Smith sctx.shift_hi = 0; 1176*4c1414c8SBarry Smith 1177*4c1414c8SBarry Smith /* if both shift schemes are chosen by user, only use info->shiftpd */ 1178*4c1414c8SBarry Smith if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0; 1179*4c1414c8SBarry Smith if (info->shiftpd) { /* set sctx.shift_top=max{rs} */ 1180*4c1414c8SBarry Smith sctx.shift_top = 0; 1181*4c1414c8SBarry Smith for (i=0; i<n; i++) { 1182*4c1414c8SBarry Smith /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1183*4c1414c8SBarry Smith rs = 0.0; 1184*4c1414c8SBarry Smith ajtmp = aj + ai[i]; 1185*4c1414c8SBarry Smith rtmp1 = aa + ai[i]; 1186*4c1414c8SBarry Smith nz = ai[i+1] - ai[i]; 1187*4c1414c8SBarry Smith for (j=0; j<nz; j++){ 1188*4c1414c8SBarry Smith if (*ajtmp != i){ 1189*4c1414c8SBarry Smith rs += PetscAbsScalar(*rtmp1++); 1190*4c1414c8SBarry Smith } else { 1191*4c1414c8SBarry Smith rs -= PetscRealPart(*rtmp1++); 1192*4c1414c8SBarry Smith } 1193*4c1414c8SBarry Smith ajtmp++; 1194*4c1414c8SBarry Smith } 1195*4c1414c8SBarry Smith if (rs>sctx.shift_top) sctx.shift_top = rs; 1196*4c1414c8SBarry Smith } 1197*4c1414c8SBarry Smith if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1198*4c1414c8SBarry Smith sctx.shift_top *= 1.1; 1199*4c1414c8SBarry Smith sctx.nshift_max = 5; 1200*4c1414c8SBarry Smith sctx.shift_lo = 0.; 1201*4c1414c8SBarry Smith sctx.shift_hi = 1.; 1202*4c1414c8SBarry Smith } 1203*4c1414c8SBarry Smith sctx.shift_amount = 0; 1204*4c1414c8SBarry Smith sctx.nshift = 0; 1205*4c1414c8SBarry Smith 1206*4c1414c8SBarry Smith ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1207*4c1414c8SBarry Smith ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1208*4c1414c8SBarry Smith ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1209*4c1414c8SBarry Smith ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1210*4c1414c8SBarry Smith ierr = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1211*4c1414c8SBarry Smith ics = ic ; 1212*4c1414c8SBarry Smith rtmp2 = rtmp1 + n; 1213*4c1414c8SBarry Smith rtmp3 = rtmp2 + n; 1214*4c1414c8SBarry Smith 1215*4c1414c8SBarry Smith node_max = a->inode.node_count; 1216*4c1414c8SBarry Smith ns = a->inode.size ; 1217*4c1414c8SBarry Smith if (!ns){ 1218*4c1414c8SBarry Smith SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1219*4c1414c8SBarry Smith } 1220*4c1414c8SBarry Smith 1221*4c1414c8SBarry Smith /* If max inode size > 3, split it into two inodes.*/ 1222*4c1414c8SBarry Smith /* also map the inode sizes according to the ordering */ 1223*4c1414c8SBarry Smith ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1224*4c1414c8SBarry Smith for (i=0,j=0; i<node_max; ++i,++j){ 1225*4c1414c8SBarry Smith if (ns[i]>3) { 1226*4c1414c8SBarry Smith tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1227*4c1414c8SBarry Smith ++j; 1228*4c1414c8SBarry Smith tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1229*4c1414c8SBarry Smith } else { 1230*4c1414c8SBarry Smith tmp_vec1[j] = ns[i]; 1231*4c1414c8SBarry Smith } 1232*4c1414c8SBarry Smith } 1233*4c1414c8SBarry Smith /* Use the correct node_max */ 1234*4c1414c8SBarry Smith node_max = j; 1235*4c1414c8SBarry Smith 1236*4c1414c8SBarry Smith /* Now reorder the inode info based on mat re-ordering info */ 1237*4c1414c8SBarry Smith /* First create a row -> inode_size_array_index map */ 1238*4c1414c8SBarry Smith ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1239*4c1414c8SBarry Smith ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1240*4c1414c8SBarry Smith for (i=0,row=0; i<node_max; i++) { 1241*4c1414c8SBarry Smith nodesz = tmp_vec1[i]; 1242*4c1414c8SBarry Smith for (j=0; j<nodesz; j++,row++) { 1243*4c1414c8SBarry Smith nsmap[row] = i; 1244*4c1414c8SBarry Smith } 1245*4c1414c8SBarry Smith } 1246*4c1414c8SBarry Smith /* Using nsmap, create a reordered ns structure */ 1247*4c1414c8SBarry Smith for (i=0,j=0; i< node_max; i++) { 1248*4c1414c8SBarry Smith nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1249*4c1414c8SBarry Smith tmp_vec2[i] = nodesz; 1250*4c1414c8SBarry Smith j += nodesz; 1251*4c1414c8SBarry Smith } 1252*4c1414c8SBarry Smith ierr = PetscFree(nsmap);CHKERRQ(ierr); 1253*4c1414c8SBarry Smith ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1254*4c1414c8SBarry Smith /* Now use the correct ns */ 1255*4c1414c8SBarry Smith ns = tmp_vec2; 1256*4c1414c8SBarry Smith 1257*4c1414c8SBarry Smith do { 1258*4c1414c8SBarry Smith sctx.lushift = PETSC_FALSE; 1259*4c1414c8SBarry Smith /* Now loop over each block-row, and do the factorization */ 1260*4c1414c8SBarry Smith for (i=0,row=0; i<node_max; i++) { 1261*4c1414c8SBarry Smith nodesz = ns[i]; 1262*4c1414c8SBarry Smith nz = bi[row+1] - bi[row]; 1263*4c1414c8SBarry Smith bjtmp = bj + bi[row]; 1264*4c1414c8SBarry Smith 1265*4c1414c8SBarry Smith switch (nodesz){ 1266*4c1414c8SBarry Smith case 1: 1267*4c1414c8SBarry Smith for (j=0; j<nz; j++){ 1268*4c1414c8SBarry Smith idx = bjtmp[j]; 1269*4c1414c8SBarry Smith rtmp1[idx] = 0.0; 1270*4c1414c8SBarry Smith } 1271*4c1414c8SBarry Smith 1272*4c1414c8SBarry Smith /* load in initial (unfactored row) */ 1273*4c1414c8SBarry Smith idx = r[row]; 1274*4c1414c8SBarry Smith nz_tmp = ai[idx+1] - ai[idx]; 1275*4c1414c8SBarry Smith ajtmp = aj + ai[idx]; 1276*4c1414c8SBarry Smith v1 = aa + ai[idx]; 1277*4c1414c8SBarry Smith 1278*4c1414c8SBarry Smith for (j=0; j<nz_tmp; j++) { 1279*4c1414c8SBarry Smith idx = ics[ajtmp[j]]; 1280*4c1414c8SBarry Smith rtmp1[idx] = v1[j]; 1281*4c1414c8SBarry Smith } 1282*4c1414c8SBarry Smith rtmp1[ics[r[row]]] += sctx.shift_amount; 1283*4c1414c8SBarry Smith 1284*4c1414c8SBarry Smith prow = *bjtmp++ ; 1285*4c1414c8SBarry Smith while (prow < row) { 1286*4c1414c8SBarry Smith pc1 = rtmp1 + prow; 1287*4c1414c8SBarry Smith if (*pc1 != 0.0){ 1288*4c1414c8SBarry Smith pv = ba + bd[prow]; 1289*4c1414c8SBarry Smith pj = nbj + bd[prow]; 1290*4c1414c8SBarry Smith mul1 = *pc1 * *pv++; 1291*4c1414c8SBarry Smith *pc1 = mul1; 1292*4c1414c8SBarry Smith nz_tmp = bi[prow+1] - bd[prow] - 1; 1293*4c1414c8SBarry Smith ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1294*4c1414c8SBarry Smith for (j=0; j<nz_tmp; j++) { 1295*4c1414c8SBarry Smith tmp = pv[j]; 1296*4c1414c8SBarry Smith idx = pj[j]; 1297*4c1414c8SBarry Smith rtmp1[idx] -= mul1 * tmp; 1298*4c1414c8SBarry Smith } 1299*4c1414c8SBarry Smith } 1300*4c1414c8SBarry Smith prow = *bjtmp++ ; 1301*4c1414c8SBarry Smith } 1302*4c1414c8SBarry Smith pj = bj + bi[row]; 1303*4c1414c8SBarry Smith pc1 = ba + bi[row]; 1304*4c1414c8SBarry Smith 1305*4c1414c8SBarry Smith sctx.pv = rtmp1[row]; 1306*4c1414c8SBarry Smith rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */ 1307*4c1414c8SBarry Smith rs = 0.0; 1308*4c1414c8SBarry Smith for (j=0; j<nz; j++) { 1309*4c1414c8SBarry Smith idx = pj[j]; 1310*4c1414c8SBarry Smith pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */ 1311*4c1414c8SBarry Smith if (idx != row) rs += PetscAbsScalar(pc1[j]); 1312*4c1414c8SBarry Smith } 1313*4c1414c8SBarry Smith sctx.rs = rs; 1314*4c1414c8SBarry Smith ierr = MatLUCheckShift_inline(info,sctx,row,a->diag,newshift);CHKERRQ(ierr); 1315*4c1414c8SBarry Smith if (newshift == 1) goto endofwhile; 1316*4c1414c8SBarry Smith break; 1317*4c1414c8SBarry Smith 1318*4c1414c8SBarry Smith case 2: 1319*4c1414c8SBarry Smith for (j=0; j<nz; j++) { 1320*4c1414c8SBarry Smith idx = bjtmp[j]; 1321*4c1414c8SBarry Smith rtmp1[idx] = 0.0; 1322*4c1414c8SBarry Smith rtmp2[idx] = 0.0; 1323*4c1414c8SBarry Smith } 1324*4c1414c8SBarry Smith 1325*4c1414c8SBarry Smith /* load in initial (unfactored row) */ 1326*4c1414c8SBarry Smith idx = r[row]; 1327*4c1414c8SBarry Smith nz_tmp = ai[idx+1] - ai[idx]; 1328*4c1414c8SBarry Smith ajtmp = aj + ai[idx]; 1329*4c1414c8SBarry Smith v1 = aa + ai[idx]; 1330*4c1414c8SBarry Smith v2 = aa + ai[idx+1]; 1331*4c1414c8SBarry Smith for (j=0; j<nz_tmp; j++) { 1332*4c1414c8SBarry Smith idx = ics[ajtmp[j]]; 1333*4c1414c8SBarry Smith rtmp1[idx] = v1[j]; 1334*4c1414c8SBarry Smith rtmp2[idx] = v2[j]; 1335*4c1414c8SBarry Smith } 1336*4c1414c8SBarry Smith rtmp1[ics[r[row]]] += sctx.shift_amount; 1337*4c1414c8SBarry Smith rtmp2[ics[r[row+1]]] += sctx.shift_amount; 1338*4c1414c8SBarry Smith 1339*4c1414c8SBarry Smith prow = *bjtmp++ ; 1340*4c1414c8SBarry Smith while (prow < row) { 1341*4c1414c8SBarry Smith pc1 = rtmp1 + prow; 1342*4c1414c8SBarry Smith pc2 = rtmp2 + prow; 1343*4c1414c8SBarry Smith if (*pc1 != 0.0 || *pc2 != 0.0){ 1344*4c1414c8SBarry Smith pv = ba + bd[prow]; 1345*4c1414c8SBarry Smith pj = nbj + bd[prow]; 1346*4c1414c8SBarry Smith mul1 = *pc1 * *pv; 1347*4c1414c8SBarry Smith mul2 = *pc2 * *pv; 1348*4c1414c8SBarry Smith ++pv; 1349*4c1414c8SBarry Smith *pc1 = mul1; 1350*4c1414c8SBarry Smith *pc2 = mul2; 1351*4c1414c8SBarry Smith 1352*4c1414c8SBarry Smith nz_tmp = bi[prow+1] - bd[prow] - 1; 1353*4c1414c8SBarry Smith for (j=0; j<nz_tmp; j++) { 1354*4c1414c8SBarry Smith tmp = pv[j]; 1355*4c1414c8SBarry Smith idx = pj[j]; 1356*4c1414c8SBarry Smith rtmp1[idx] -= mul1 * tmp; 1357*4c1414c8SBarry Smith rtmp2[idx] -= mul2 * tmp; 1358*4c1414c8SBarry Smith } 1359*4c1414c8SBarry Smith ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1360*4c1414c8SBarry Smith } 1361*4c1414c8SBarry Smith prow = *bjtmp++ ; 1362*4c1414c8SBarry Smith } 1363*4c1414c8SBarry Smith 1364*4c1414c8SBarry Smith /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1365*4c1414c8SBarry Smith pc1 = rtmp1 + prow; 1366*4c1414c8SBarry Smith pc2 = rtmp2 + prow; 1367*4c1414c8SBarry Smith 1368*4c1414c8SBarry Smith sctx.pv = *pc1; 1369*4c1414c8SBarry Smith pj = bj + bi[prow]; 1370*4c1414c8SBarry Smith rs = 0.0; 1371*4c1414c8SBarry Smith for (j=0; j<nz; j++){ 1372*4c1414c8SBarry Smith idx = pj[j]; 1373*4c1414c8SBarry Smith if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]); 1374*4c1414c8SBarry Smith } 1375*4c1414c8SBarry Smith sctx.rs = rs; 1376*4c1414c8SBarry Smith ierr = MatLUCheckShift_inline(info,sctx,row,a->diag,newshift);CHKERRQ(ierr); 1377*4c1414c8SBarry Smith if (newshift == 1) goto endofwhile; 1378*4c1414c8SBarry Smith 1379*4c1414c8SBarry Smith if (*pc2 != 0.0){ 1380*4c1414c8SBarry Smith pj = nbj + bd[prow]; 1381*4c1414c8SBarry Smith mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1382*4c1414c8SBarry Smith *pc2 = mul2; 1383*4c1414c8SBarry Smith nz_tmp = bi[prow+1] - bd[prow] - 1; 1384*4c1414c8SBarry Smith for (j=0; j<nz_tmp; j++) { 1385*4c1414c8SBarry Smith idx = pj[j] ; 1386*4c1414c8SBarry Smith tmp = rtmp1[idx]; 1387*4c1414c8SBarry Smith rtmp2[idx] -= mul2 * tmp; 1388*4c1414c8SBarry Smith } 1389*4c1414c8SBarry Smith ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1390*4c1414c8SBarry Smith } 1391*4c1414c8SBarry Smith 1392*4c1414c8SBarry Smith pj = bj + bi[row]; 1393*4c1414c8SBarry Smith pc1 = ba + bi[row]; 1394*4c1414c8SBarry Smith pc2 = ba + bi[row+1]; 1395*4c1414c8SBarry Smith 1396*4c1414c8SBarry Smith sctx.pv = rtmp2[row+1]; 1397*4c1414c8SBarry Smith rs = 0.0; 1398*4c1414c8SBarry Smith rtmp1[row] = 1.0/rtmp1[row]; 1399*4c1414c8SBarry Smith rtmp2[row+1] = 1.0/rtmp2[row+1]; 1400*4c1414c8SBarry Smith /* copy row entries from dense representation to sparse */ 1401*4c1414c8SBarry Smith for (j=0; j<nz; j++) { 1402*4c1414c8SBarry Smith idx = pj[j]; 1403*4c1414c8SBarry Smith pc1[j] = rtmp1[idx]; 1404*4c1414c8SBarry Smith pc2[j] = rtmp2[idx]; 1405*4c1414c8SBarry Smith if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1406*4c1414c8SBarry Smith } 1407*4c1414c8SBarry Smith sctx.rs = rs; 1408*4c1414c8SBarry Smith ierr = MatLUCheckShift_inline(info,sctx,row+1,a->diag,newshift);CHKERRQ(ierr); 1409*4c1414c8SBarry Smith if (newshift == 1) goto endofwhile; 1410*4c1414c8SBarry Smith break; 1411*4c1414c8SBarry Smith 1412*4c1414c8SBarry Smith case 3: 1413*4c1414c8SBarry Smith for (j=0; j<nz; j++) { 1414*4c1414c8SBarry Smith idx = bjtmp[j]; 1415*4c1414c8SBarry Smith rtmp1[idx] = 0.0; 1416*4c1414c8SBarry Smith rtmp2[idx] = 0.0; 1417*4c1414c8SBarry Smith rtmp3[idx] = 0.0; 1418*4c1414c8SBarry Smith } 1419*4c1414c8SBarry Smith /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1420*4c1414c8SBarry Smith idx = r[row]; 1421*4c1414c8SBarry Smith nz_tmp = ai[idx+1] - ai[idx]; 1422*4c1414c8SBarry Smith ajtmp = aj + ai[idx]; 1423*4c1414c8SBarry Smith v1 = aa + ai[idx]; 1424*4c1414c8SBarry Smith v2 = aa + ai[idx+1]; 1425*4c1414c8SBarry Smith v3 = aa + ai[idx+2]; 1426*4c1414c8SBarry Smith for (j=0; j<nz_tmp; j++) { 1427*4c1414c8SBarry Smith idx = ics[ajtmp[j]]; 1428*4c1414c8SBarry Smith rtmp1[idx] = v1[j]; 1429*4c1414c8SBarry Smith rtmp2[idx] = v2[j]; 1430*4c1414c8SBarry Smith rtmp3[idx] = v3[j]; 1431*4c1414c8SBarry Smith } 1432*4c1414c8SBarry Smith rtmp1[ics[r[row]]] += sctx.shift_amount; 1433*4c1414c8SBarry Smith rtmp2[ics[r[row+1]]] += sctx.shift_amount; 1434*4c1414c8SBarry Smith rtmp3[ics[r[row+2]]] += sctx.shift_amount; 1435*4c1414c8SBarry Smith 1436*4c1414c8SBarry Smith /* loop over all pivot row blocks above this row block */ 1437*4c1414c8SBarry Smith prow = *bjtmp++ ; 1438*4c1414c8SBarry Smith while (prow < row) { 1439*4c1414c8SBarry Smith pc1 = rtmp1 + prow; 1440*4c1414c8SBarry Smith pc2 = rtmp2 + prow; 1441*4c1414c8SBarry Smith pc3 = rtmp3 + prow; 1442*4c1414c8SBarry Smith if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1443*4c1414c8SBarry Smith pv = ba + bd[prow]; 1444*4c1414c8SBarry Smith pj = nbj + bd[prow]; 1445*4c1414c8SBarry Smith mul1 = *pc1 * *pv; 1446*4c1414c8SBarry Smith mul2 = *pc2 * *pv; 1447*4c1414c8SBarry Smith mul3 = *pc3 * *pv; 1448*4c1414c8SBarry Smith ++pv; 1449*4c1414c8SBarry Smith *pc1 = mul1; 1450*4c1414c8SBarry Smith *pc2 = mul2; 1451*4c1414c8SBarry Smith *pc3 = mul3; 1452*4c1414c8SBarry Smith 1453*4c1414c8SBarry Smith nz_tmp = bi[prow+1] - bd[prow] - 1; 1454*4c1414c8SBarry Smith /* update this row based on pivot row */ 1455*4c1414c8SBarry Smith for (j=0; j<nz_tmp; j++) { 1456*4c1414c8SBarry Smith tmp = pv[j]; 1457*4c1414c8SBarry Smith idx = pj[j]; 1458*4c1414c8SBarry Smith rtmp1[idx] -= mul1 * tmp; 1459*4c1414c8SBarry Smith rtmp2[idx] -= mul2 * tmp; 1460*4c1414c8SBarry Smith rtmp3[idx] -= mul3 * tmp; 1461*4c1414c8SBarry Smith } 1462*4c1414c8SBarry Smith ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 1463*4c1414c8SBarry Smith } 1464*4c1414c8SBarry Smith prow = *bjtmp++ ; 1465*4c1414c8SBarry Smith } 1466*4c1414c8SBarry Smith 1467*4c1414c8SBarry Smith /* Now take care of diagonal 3x3 block in this set of rows */ 1468*4c1414c8SBarry Smith /* note: prow = row here */ 1469*4c1414c8SBarry Smith pc1 = rtmp1 + prow; 1470*4c1414c8SBarry Smith pc2 = rtmp2 + prow; 1471*4c1414c8SBarry Smith pc3 = rtmp3 + prow; 1472*4c1414c8SBarry Smith 1473*4c1414c8SBarry Smith sctx.pv = *pc1; 1474*4c1414c8SBarry Smith pj = bj + bi[prow]; 1475*4c1414c8SBarry Smith rs = 0.0; 1476*4c1414c8SBarry Smith for (j=0; j<nz; j++){ 1477*4c1414c8SBarry Smith idx = pj[j]; 1478*4c1414c8SBarry Smith if (idx != row) rs += PetscAbsScalar(rtmp1[idx]); 1479*4c1414c8SBarry Smith } 1480*4c1414c8SBarry Smith sctx.rs = rs; 1481*4c1414c8SBarry Smith ierr = MatLUCheckShift_inline(info,sctx,row,a->diag,newshift);CHKERRQ(ierr); 1482*4c1414c8SBarry Smith if (newshift == 1) goto endofwhile; 1483*4c1414c8SBarry Smith 1484*4c1414c8SBarry Smith if (*pc2 != 0.0 || *pc3 != 0.0){ 1485*4c1414c8SBarry Smith mul2 = (*pc2)/(*pc1); 1486*4c1414c8SBarry Smith mul3 = (*pc3)/(*pc1); 1487*4c1414c8SBarry Smith *pc2 = mul2; 1488*4c1414c8SBarry Smith *pc3 = mul3; 1489*4c1414c8SBarry Smith nz_tmp = bi[prow+1] - bd[prow] - 1; 1490*4c1414c8SBarry Smith pj = nbj + bd[prow]; 1491*4c1414c8SBarry Smith for (j=0; j<nz_tmp; j++) { 1492*4c1414c8SBarry Smith idx = pj[j] ; 1493*4c1414c8SBarry Smith tmp = rtmp1[idx]; 1494*4c1414c8SBarry Smith rtmp2[idx] -= mul2 * tmp; 1495*4c1414c8SBarry Smith rtmp3[idx] -= mul3 * tmp; 1496*4c1414c8SBarry Smith } 1497*4c1414c8SBarry Smith ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1498*4c1414c8SBarry Smith } 1499*4c1414c8SBarry Smith ++prow; 1500*4c1414c8SBarry Smith 1501*4c1414c8SBarry Smith pc2 = rtmp2 + prow; 1502*4c1414c8SBarry Smith pc3 = rtmp3 + prow; 1503*4c1414c8SBarry Smith sctx.pv = *pc2; 1504*4c1414c8SBarry Smith pj = bj + bi[prow]; 1505*4c1414c8SBarry Smith rs = 0.0; 1506*4c1414c8SBarry Smith for (j=0; j<nz; j++){ 1507*4c1414c8SBarry Smith idx = pj[j]; 1508*4c1414c8SBarry Smith if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]); 1509*4c1414c8SBarry Smith } 1510*4c1414c8SBarry Smith sctx.rs = rs; 1511*4c1414c8SBarry Smith ierr = MatLUCheckShift_inline(info,sctx,row+1,a->diag,newshift);CHKERRQ(ierr); 1512*4c1414c8SBarry Smith if (newshift == 1) goto endofwhile; 1513*4c1414c8SBarry Smith 1514*4c1414c8SBarry Smith if (*pc3 != 0.0){ 1515*4c1414c8SBarry Smith mul3 = (*pc3)/(*pc2); 1516*4c1414c8SBarry Smith *pc3 = mul3; 1517*4c1414c8SBarry Smith pj = nbj + bd[prow]; 1518*4c1414c8SBarry Smith nz_tmp = bi[prow+1] - bd[prow] - 1; 1519*4c1414c8SBarry Smith for (j=0; j<nz_tmp; j++) { 1520*4c1414c8SBarry Smith idx = pj[j] ; 1521*4c1414c8SBarry Smith tmp = rtmp2[idx]; 1522*4c1414c8SBarry Smith rtmp3[idx] -= mul3 * tmp; 1523*4c1414c8SBarry Smith } 1524*4c1414c8SBarry Smith ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1525*4c1414c8SBarry Smith } 1526*4c1414c8SBarry Smith 1527*4c1414c8SBarry Smith pj = bj + bi[row]; 1528*4c1414c8SBarry Smith pc1 = ba + bi[row]; 1529*4c1414c8SBarry Smith pc2 = ba + bi[row+1]; 1530*4c1414c8SBarry Smith pc3 = ba + bi[row+2]; 1531*4c1414c8SBarry Smith 1532*4c1414c8SBarry Smith sctx.pv = rtmp3[row+2]; 1533*4c1414c8SBarry Smith rs = 0.0; 1534*4c1414c8SBarry Smith rtmp1[row] = 1.0/rtmp1[row]; 1535*4c1414c8SBarry Smith rtmp2[row+1] = 1.0/rtmp2[row+1]; 1536*4c1414c8SBarry Smith rtmp3[row+2] = 1.0/rtmp3[row+2]; 1537*4c1414c8SBarry Smith /* copy row entries from dense representation to sparse */ 1538*4c1414c8SBarry Smith for (j=0; j<nz; j++) { 1539*4c1414c8SBarry Smith idx = pj[j]; 1540*4c1414c8SBarry Smith pc1[j] = rtmp1[idx]; 1541*4c1414c8SBarry Smith pc2[j] = rtmp2[idx]; 1542*4c1414c8SBarry Smith pc3[j] = rtmp3[idx]; 1543*4c1414c8SBarry Smith if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 1544*4c1414c8SBarry Smith } 1545*4c1414c8SBarry Smith 1546*4c1414c8SBarry Smith sctx.rs = rs; 1547*4c1414c8SBarry Smith ierr = MatLUCheckShift_inline(info,sctx,row+2,a->diag,newshift);CHKERRQ(ierr); 1548*4c1414c8SBarry Smith if (newshift == 1) goto endofwhile; 1549*4c1414c8SBarry Smith break; 1550*4c1414c8SBarry Smith 1551*4c1414c8SBarry Smith default: 1552*4c1414c8SBarry Smith SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1553*4c1414c8SBarry Smith } 1554*4c1414c8SBarry Smith row += nodesz; /* Update the row */ 1555*4c1414c8SBarry Smith } 1556*4c1414c8SBarry Smith endofwhile:; 1557*4c1414c8SBarry Smith } while (sctx.lushift); 1558*4c1414c8SBarry Smith ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1559*4c1414c8SBarry Smith ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1560*4c1414c8SBarry Smith ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1561*4c1414c8SBarry Smith ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1562*4c1414c8SBarry Smith ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 1563*4c1414c8SBarry Smith C->factor = FACTOR_LU; 1564*4c1414c8SBarry Smith C->assembled = PETSC_TRUE; 1565*4c1414c8SBarry Smith if (sctx.nshift) { 1566*4c1414c8SBarry Smith if (info->shiftnz) { 1567*4c1414c8SBarry Smith ierr = PetscInfo2(0,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1568*4c1414c8SBarry Smith } else if (info->shiftpd) { 1569*4c1414c8SBarry Smith ierr = PetscInfo4(0,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1570*4c1414c8SBarry Smith } 1571*4c1414c8SBarry Smith } 1572*4c1414c8SBarry Smith ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr); 1573*4c1414c8SBarry Smith PetscFunctionReturn(0); 1574*4c1414c8SBarry Smith } 1575*4c1414c8SBarry Smith 1576*4c1414c8SBarry Smith /* 1577*4c1414c8SBarry Smith Makes a longer coloring[] array and calls the usual code with that 1578*4c1414c8SBarry Smith */ 1579*4c1414c8SBarry Smith #undef __FUNCT__ 1580*4c1414c8SBarry Smith #define __FUNCT__ "MatColoringPatch_Inode" 1581*4c1414c8SBarry Smith PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1582*4c1414c8SBarry Smith { 1583*4c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1584*4c1414c8SBarry Smith PetscErrorCode ierr; 1585*4c1414c8SBarry Smith PetscInt n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1586*4c1414c8SBarry Smith PetscInt *colorused,i; 1587*4c1414c8SBarry Smith ISColoringValue *newcolor; 1588*4c1414c8SBarry Smith 1589*4c1414c8SBarry Smith PetscFunctionBegin; 1590*4c1414c8SBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1591*4c1414c8SBarry Smith /* loop over inodes, marking a color for each column*/ 1592*4c1414c8SBarry Smith row = 0; 1593*4c1414c8SBarry Smith for (i=0; i<m; i++){ 1594*4c1414c8SBarry Smith for (j=0; j<ns[i]; j++) { 1595*4c1414c8SBarry Smith newcolor[row++] = coloring[i] + j*ncolors; 1596*4c1414c8SBarry Smith } 1597*4c1414c8SBarry Smith } 1598*4c1414c8SBarry Smith 1599*4c1414c8SBarry Smith /* eliminate unneeded colors */ 1600*4c1414c8SBarry Smith ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1601*4c1414c8SBarry Smith ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1602*4c1414c8SBarry Smith for (i=0; i<n; i++) { 1603*4c1414c8SBarry Smith colorused[newcolor[i]] = 1; 1604*4c1414c8SBarry Smith } 1605*4c1414c8SBarry Smith 1606*4c1414c8SBarry Smith for (i=1; i<5*ncolors; i++) { 1607*4c1414c8SBarry Smith colorused[i] += colorused[i-1]; 1608*4c1414c8SBarry Smith } 1609*4c1414c8SBarry Smith ncolors = colorused[5*ncolors-1]; 1610*4c1414c8SBarry Smith for (i=0; i<n; i++) { 1611*4c1414c8SBarry Smith newcolor[i] = colorused[newcolor[i]]; 1612*4c1414c8SBarry Smith } 1613*4c1414c8SBarry Smith ierr = PetscFree(colorused);CHKERRQ(ierr); 1614*4c1414c8SBarry Smith ierr = ISColoringCreate(mat->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1615*4c1414c8SBarry Smith ierr = PetscFree(coloring);CHKERRQ(ierr); 1616*4c1414c8SBarry Smith PetscFunctionReturn(0); 1617*4c1414c8SBarry Smith } 1618*4c1414c8SBarry Smith 1619*4c1414c8SBarry Smith /* 1620*4c1414c8SBarry Smith samestructure indicates that the matrix has not changed its nonzero structure so we 1621*4c1414c8SBarry Smith do not need to recompute the inodes 1622*4c1414c8SBarry Smith */ 1623*4c1414c8SBarry Smith #undef __FUNCT__ 1624*4c1414c8SBarry Smith #define __FUNCT__ "Mat_CheckInode" 1625*4c1414c8SBarry Smith PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 1626*4c1414c8SBarry Smith { 1627*4c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1628*4c1414c8SBarry Smith PetscErrorCode ierr; 1629*4c1414c8SBarry Smith PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 1630*4c1414c8SBarry Smith PetscTruth flag,flg; 1631*4c1414c8SBarry Smith 1632*4c1414c8SBarry Smith PetscFunctionBegin; 1633*4c1414c8SBarry Smith if (a->inode.checked && samestructure) PetscFunctionReturn(0); 1634*4c1414c8SBarry Smith 1635*4c1414c8SBarry Smith a->inode.checked = PETSC_TRUE; 1636*4c1414c8SBarry Smith 1637*4c1414c8SBarry Smith /* Notes: We set a->inode.limit=5 in MatCreate_Inode(). */ 1638*4c1414c8SBarry Smith if (!a->inode.use) {ierr = PetscInfo(A,"Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODES\n");CHKERRQ(ierr); PetscFunctionReturn(0);} 1639*4c1414c8SBarry Smith 1640*4c1414c8SBarry Smith ierr = PetscOptionsBegin(A->comm,A->prefix,"Options for SEQAIJ matrix","Mat");CHKERRQ(ierr); 1641*4c1414c8SBarry Smith ierr = PetscOptionsTruth("-mat_no_unroll","Do not optimize for inodes (slower)",PETSC_NULL,PETSC_FALSE,&flg,PETSC_NULL);CHKERRQ(ierr); 1642*4c1414c8SBarry Smith if (flg) {ierr = PetscInfo(A,"Not using Inode routines due to -mat_no_unroll\n");CHKERRQ(ierr);PetscFunctionReturn(0);} 1643*4c1414c8SBarry Smith ierr = PetscOptionsTruth("-mat_no_inode","Do not optimize for inodes (slower)",PETSC_NULL,PETSC_FALSE,&flg,PETSC_NULL);CHKERRQ(ierr); 1644*4c1414c8SBarry Smith if (flg) {ierr = PetscInfo(A,"Not using Inode routines due to -mat_no_inode\n");CHKERRQ(ierr);PetscFunctionReturn(0);} 1645*4c1414c8SBarry Smith ierr = PetscOptionsInt("-mat_inode_limit","Do not use inodes larger then this value",PETSC_NULL,a->inode.limit,&a->inode.limit,PETSC_NULL);CHKERRQ(ierr); 1646*4c1414c8SBarry Smith ierr = PetscOptionsEnd();CHKERRQ(ierr); 1647*4c1414c8SBarry Smith if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit; 1648*4c1414c8SBarry Smith 1649*4c1414c8SBarry Smith m = A->rmap.n; 1650*4c1414c8SBarry Smith if (a->inode.size) {ns = a->inode.size;} 1651*4c1414c8SBarry Smith else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 1652*4c1414c8SBarry Smith 1653*4c1414c8SBarry Smith i = 0; 1654*4c1414c8SBarry Smith node_count = 0; 1655*4c1414c8SBarry Smith idx = a->j; 1656*4c1414c8SBarry Smith ii = a->i; 1657*4c1414c8SBarry Smith while (i < m){ /* For each row */ 1658*4c1414c8SBarry Smith nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 1659*4c1414c8SBarry Smith /* Limits the number of elements in a node to 'a->inode.limit' */ 1660*4c1414c8SBarry Smith for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 1661*4c1414c8SBarry Smith nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 1662*4c1414c8SBarry Smith if (nzy != nzx) break; 1663*4c1414c8SBarry Smith idy += nzx; /* Same nonzero pattern */ 1664*4c1414c8SBarry Smith ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1665*4c1414c8SBarry Smith if (!flag) break; 1666*4c1414c8SBarry Smith } 1667*4c1414c8SBarry Smith ns[node_count++] = blk_size; 1668*4c1414c8SBarry Smith idx += blk_size*nzx; 1669*4c1414c8SBarry Smith i = j; 1670*4c1414c8SBarry Smith } 1671*4c1414c8SBarry Smith /* If not enough inodes found,, do not use inode version of the routines */ 1672*4c1414c8SBarry Smith if (!a->inode.size && m && node_count > 0.9*m) { 1673*4c1414c8SBarry Smith ierr = PetscFree(ns);CHKERRQ(ierr); 1674*4c1414c8SBarry Smith a->inode.node_count = 0; 1675*4c1414c8SBarry Smith a->inode.size = PETSC_NULL; 1676*4c1414c8SBarry Smith a->inode.use = PETSC_FALSE; 1677*4c1414c8SBarry Smith ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 1678*4c1414c8SBarry Smith } else { 1679*4c1414c8SBarry Smith A->ops->mult = MatMult_Inode; 1680*4c1414c8SBarry Smith A->ops->multadd = MatMultAdd_Inode; 1681*4c1414c8SBarry Smith A->ops->solve = MatSolve_Inode; 1682*4c1414c8SBarry Smith A->ops->lufactornumeric = MatLUFactorNumeric_Inode; 1683*4c1414c8SBarry Smith A->ops->getrowij = MatGetRowIJ_Inode; 1684*4c1414c8SBarry Smith A->ops->restorerowij = MatRestoreRowIJ_Inode; 1685*4c1414c8SBarry Smith A->ops->getcolumnij = MatGetColumnIJ_Inode; 1686*4c1414c8SBarry Smith A->ops->restorecolumnij = MatRestoreColumnIJ_Inode; 1687*4c1414c8SBarry Smith A->ops->coloringpatch = MatColoringPatch_Inode; 1688*4c1414c8SBarry Smith a->inode.node_count = node_count; 1689*4c1414c8SBarry Smith a->inode.size = ns; 1690*4c1414c8SBarry Smith ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 1691*4c1414c8SBarry Smith } 1692*4c1414c8SBarry Smith PetscFunctionReturn(0); 1693*4c1414c8SBarry Smith } 1694*4c1414c8SBarry Smith 1695*4c1414c8SBarry Smith /* 1696*4c1414c8SBarry Smith This is really ugly. if inodes are used this replaces the 1697*4c1414c8SBarry Smith permutations with ones that correspond to rows/cols of the matrix 1698*4c1414c8SBarry Smith rather then inode blocks 1699*4c1414c8SBarry Smith */ 1700*4c1414c8SBarry Smith #undef __FUNCT__ 1701*4c1414c8SBarry Smith #define __FUNCT__ "MatInodeAdjustForInodes" 1702*4c1414c8SBarry Smith PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 1703*4c1414c8SBarry Smith { 1704*4c1414c8SBarry Smith PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 1705*4c1414c8SBarry Smith 1706*4c1414c8SBarry Smith PetscFunctionBegin; 1707*4c1414c8SBarry Smith ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 1708*4c1414c8SBarry Smith if (f) { 1709*4c1414c8SBarry Smith ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 1710*4c1414c8SBarry Smith } 1711*4c1414c8SBarry Smith PetscFunctionReturn(0); 1712*4c1414c8SBarry Smith } 1713*4c1414c8SBarry Smith 1714*4c1414c8SBarry Smith EXTERN_C_BEGIN 1715*4c1414c8SBarry Smith #undef __FUNCT__ 1716*4c1414c8SBarry Smith #define __FUNCT__ "MatAdjustForInodes_Inode" 1717*4c1414c8SBarry Smith PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm) 1718*4c1414c8SBarry Smith { 1719*4c1414c8SBarry Smith Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 1720*4c1414c8SBarry Smith PetscErrorCode ierr; 1721*4c1414c8SBarry Smith PetscInt m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count; 1722*4c1414c8SBarry Smith PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 1723*4c1414c8SBarry Smith PetscInt nslim_col,*ns_col; 1724*4c1414c8SBarry Smith IS ris = *rperm,cis = *cperm; 1725*4c1414c8SBarry Smith 1726*4c1414c8SBarry Smith PetscFunctionBegin; 1727*4c1414c8SBarry Smith if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 1728*4c1414c8SBarry Smith if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 1729*4c1414c8SBarry Smith 1730*4c1414c8SBarry Smith ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 1731*4c1414c8SBarry Smith ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 1732*4c1414c8SBarry Smith ierr = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr); 1733*4c1414c8SBarry Smith permc = permr + m; 1734*4c1414c8SBarry Smith 1735*4c1414c8SBarry Smith ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 1736*4c1414c8SBarry Smith ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 1737*4c1414c8SBarry Smith 1738*4c1414c8SBarry Smith /* Form the inode structure for the rows of permuted matric using inv perm*/ 1739*4c1414c8SBarry Smith for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 1740*4c1414c8SBarry Smith 1741*4c1414c8SBarry Smith /* Construct the permutations for rows*/ 1742*4c1414c8SBarry Smith for (i=0,row = 0; i<nslim_row; ++i){ 1743*4c1414c8SBarry Smith indx = ridx[i]; 1744*4c1414c8SBarry Smith start_val = tns[indx]; 1745*4c1414c8SBarry Smith end_val = tns[indx + 1]; 1746*4c1414c8SBarry Smith for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 1747*4c1414c8SBarry Smith } 1748*4c1414c8SBarry Smith 1749*4c1414c8SBarry Smith /* Form the inode structure for the columns of permuted matrix using inv perm*/ 1750*4c1414c8SBarry Smith for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 1751*4c1414c8SBarry Smith 1752*4c1414c8SBarry Smith /* Construct permutations for columns */ 1753*4c1414c8SBarry Smith for (i=0,col=0; i<nslim_col; ++i){ 1754*4c1414c8SBarry Smith indx = cidx[i]; 1755*4c1414c8SBarry Smith start_val = tns[indx]; 1756*4c1414c8SBarry Smith end_val = tns[indx + 1]; 1757*4c1414c8SBarry Smith for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 1758*4c1414c8SBarry Smith } 1759*4c1414c8SBarry Smith 1760*4c1414c8SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 1761*4c1414c8SBarry Smith ISSetPermutation(*rperm); 1762*4c1414c8SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 1763*4c1414c8SBarry Smith ISSetPermutation(*cperm); 1764*4c1414c8SBarry Smith 1765*4c1414c8SBarry Smith ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 1766*4c1414c8SBarry Smith ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 1767*4c1414c8SBarry Smith 1768*4c1414c8SBarry Smith ierr = PetscFree(ns_col);CHKERRQ(ierr); 1769*4c1414c8SBarry Smith ierr = PetscFree(permr);CHKERRQ(ierr); 1770*4c1414c8SBarry Smith ierr = ISDestroy(cis);CHKERRQ(ierr); 1771*4c1414c8SBarry Smith ierr = ISDestroy(ris);CHKERRQ(ierr); 1772*4c1414c8SBarry Smith ierr = PetscFree(tns);CHKERRQ(ierr); 1773*4c1414c8SBarry Smith PetscFunctionReturn(0); 1774*4c1414c8SBarry Smith } 1775*4c1414c8SBarry Smith EXTERN_C_END 1776*4c1414c8SBarry Smith 1777*4c1414c8SBarry Smith #undef __FUNCT__ 1778*4c1414c8SBarry Smith #define __FUNCT__ "MatInodeGetInodeSizes" 1779*4c1414c8SBarry Smith /*@C 1780*4c1414c8SBarry Smith MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 1781*4c1414c8SBarry Smith 1782*4c1414c8SBarry Smith Collective on Mat 1783*4c1414c8SBarry Smith 1784*4c1414c8SBarry Smith Input Parameter: 1785*4c1414c8SBarry Smith . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 1786*4c1414c8SBarry Smith 1787*4c1414c8SBarry Smith Output Parameter: 1788*4c1414c8SBarry Smith + node_count - no of inodes present in the matrix. 1789*4c1414c8SBarry Smith . sizes - an array of size node_count,with sizes of each inode. 1790*4c1414c8SBarry Smith - limit - the max size used to generate the inodes. 1791*4c1414c8SBarry Smith 1792*4c1414c8SBarry Smith Level: advanced 1793*4c1414c8SBarry Smith 1794*4c1414c8SBarry Smith Notes: This routine returns some internal storage information 1795*4c1414c8SBarry Smith of the matrix, it is intended to be used by advanced users. 1796*4c1414c8SBarry Smith It should be called after the matrix is assembled. 1797*4c1414c8SBarry Smith The contents of the sizes[] array should not be changed. 1798*4c1414c8SBarry Smith PETSC_NULL may be passed for information not requested. 1799*4c1414c8SBarry Smith 1800*4c1414c8SBarry Smith .keywords: matrix, seqaij, get, inode 1801*4c1414c8SBarry Smith 1802*4c1414c8SBarry Smith .seealso: MatGetInfo() 1803*4c1414c8SBarry Smith @*/ 1804*4c1414c8SBarry Smith PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 1805*4c1414c8SBarry Smith { 1806*4c1414c8SBarry Smith PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 1807*4c1414c8SBarry Smith 1808*4c1414c8SBarry Smith PetscFunctionBegin; 1809*4c1414c8SBarry Smith if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 1810*4c1414c8SBarry Smith ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 1811*4c1414c8SBarry Smith if (f) { 1812*4c1414c8SBarry Smith ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 1813*4c1414c8SBarry Smith } 1814*4c1414c8SBarry Smith PetscFunctionReturn(0); 1815*4c1414c8SBarry Smith } 1816*4c1414c8SBarry Smith 1817*4c1414c8SBarry Smith EXTERN_C_BEGIN 1818*4c1414c8SBarry Smith #undef __FUNCT__ 1819*4c1414c8SBarry Smith #define __FUNCT__ "MatInodeGetInodeSizes_Inode" 1820*4c1414c8SBarry Smith PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 1821*4c1414c8SBarry Smith { 1822*4c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1823*4c1414c8SBarry Smith 1824*4c1414c8SBarry Smith PetscFunctionBegin; 1825*4c1414c8SBarry Smith if (node_count) *node_count = a->inode.node_count; 1826*4c1414c8SBarry Smith if (sizes) *sizes = a->inode.size; 1827*4c1414c8SBarry Smith if (limit) *limit = a->inode.limit; 1828*4c1414c8SBarry Smith PetscFunctionReturn(0); 1829*4c1414c8SBarry Smith } 1830*4c1414c8SBarry Smith EXTERN_C_END 1831