1*b5b17502SBarry Smith 2*b5b17502SBarry Smith /* 3*b5b17502SBarry Smith This is included by sbaij.c to generate unsigned short and regular versions of these two functions 4*b5b17502SBarry Smith */ 5*b5b17502SBarry Smith #undef USHORT 6*b5b17502SBarry Smith #if defined(USESHORT) 7*b5b17502SBarry Smith #define USHORT _ushort 8*b5b17502SBarry Smith #else 9*b5b17502SBarry Smith #define USHORT 10*b5b17502SBarry Smith #endif 11*b5b17502SBarry Smith #define PETSCMAP1_a(a,b) a ## b 12*b5b17502SBarry Smith #define PETSCMAP1_b(a,b) PETSCMAP1_a(a,b) 13*b5b17502SBarry Smith #define PETSCMAP1(a) PETSCMAP1_b(a,USHORT) 14*b5b17502SBarry Smith 15*b5b17502SBarry Smith #undef __FUNCT__ 16*b5b17502SBarry Smith #define __FUNCT__ "MatMult_SeqSBAIJ_1" 17*b5b17502SBarry Smith PetscErrorCode PETSCMAP1(MatMult_SeqSBAIJ_1)(Mat A,Vec xx,Vec zz) 18*b5b17502SBarry Smith { 19*b5b17502SBarry Smith Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 20*b5b17502SBarry Smith const PetscScalar *x; 21*b5b17502SBarry Smith PetscScalar *z,x1,sum; 22*b5b17502SBarry Smith const MatScalar *v; 23*b5b17502SBarry Smith MatScalar vj; 24*b5b17502SBarry Smith PetscErrorCode ierr; 25*b5b17502SBarry Smith PetscInt mbs=a->mbs,i,j,nz; 26*b5b17502SBarry Smith const PetscInt *ai=a->i; 27*b5b17502SBarry Smith #if defined(USESHORT) 28*b5b17502SBarry Smith const unsigned short *ib=a->jshort; 29*b5b17502SBarry Smith unsigned short ibt; 30*b5b17502SBarry Smith #else 31*b5b17502SBarry Smith const PetscInt *ib=a->j; 32*b5b17502SBarry Smith PetscInt ibt; 33*b5b17502SBarry Smith #endif 34*b5b17502SBarry Smith 35*b5b17502SBarry Smith PetscFunctionBegin; 36*b5b17502SBarry Smith ierr = VecSet(zz,0.0);CHKERRQ(ierr); 37*b5b17502SBarry Smith ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 38*b5b17502SBarry Smith ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 39*b5b17502SBarry Smith 40*b5b17502SBarry Smith v = a->a; 41*b5b17502SBarry Smith for (i=0; i<mbs; i++) { 42*b5b17502SBarry Smith nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 43*b5b17502SBarry Smith x1 = x[i]; 44*b5b17502SBarry Smith sum = v[0]*x1; /* diagonal term */ 45*b5b17502SBarry Smith for (j=1; j<nz; j++) { 46*b5b17502SBarry Smith ibt = ib[j]; 47*b5b17502SBarry Smith vj = v[j]; 48*b5b17502SBarry Smith z[ibt] += vj * x1; /* (strict lower triangular part of A)*x */ 49*b5b17502SBarry Smith sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 50*b5b17502SBarry Smith } 51*b5b17502SBarry Smith z[i] += sum; 52*b5b17502SBarry Smith v += nz; 53*b5b17502SBarry Smith ib += nz; 54*b5b17502SBarry Smith } 55*b5b17502SBarry Smith 56*b5b17502SBarry Smith ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 57*b5b17502SBarry Smith ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 58*b5b17502SBarry Smith ierr = PetscLogFlops(2.0*(2.0*a->nz - mbs) - mbs);CHKERRQ(ierr); 59*b5b17502SBarry Smith PetscFunctionReturn(0); 60*b5b17502SBarry Smith } 61*b5b17502SBarry Smith 62*b5b17502SBarry Smith #undef __FUNCT__ 63*b5b17502SBarry Smith #define __FUNCT__ "MatRelax_SeqSBAIJ" 64*b5b17502SBarry Smith PetscErrorCode PETSCMAP1(MatRelax_SeqSBAIJ)(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 65*b5b17502SBarry Smith { 66*b5b17502SBarry Smith Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 67*b5b17502SBarry Smith const MatScalar *aa=a->a,*v,*v1,*aidiag; 68*b5b17502SBarry Smith PetscScalar *x,*t,sum; 69*b5b17502SBarry Smith const PetscScalar *b; 70*b5b17502SBarry Smith MatScalar tmp; 71*b5b17502SBarry Smith PetscErrorCode ierr; 72*b5b17502SBarry Smith PetscInt m=a->mbs,bs=A->rmap->bs,j; 73*b5b17502SBarry Smith const PetscInt *ai=a->i; 74*b5b17502SBarry Smith #if defined(USESHORT) 75*b5b17502SBarry Smith const unsigned short *aj=a->jshort,*vj,*vj1; 76*b5b17502SBarry Smith #else 77*b5b17502SBarry Smith const PetscInt *aj=a->j,*vj,*vj1; 78*b5b17502SBarry Smith #endif 79*b5b17502SBarry Smith PetscInt nz,nz1,i; 80*b5b17502SBarry Smith 81*b5b17502SBarry Smith PetscFunctionBegin; 82*b5b17502SBarry Smith if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support yet for Eisenstat"); 83*b5b17502SBarry Smith 84*b5b17502SBarry Smith its = its*lits; 85*b5b17502SBarry Smith if (its <= 0) SETERRQ2(PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits); 86*b5b17502SBarry Smith 87*b5b17502SBarry Smith if (bs > 1) SETERRQ(PETSC_ERR_SUP,"SSOR for block size > 1 is not yet implemented"); 88*b5b17502SBarry Smith 89*b5b17502SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 90*b5b17502SBarry Smith if (xx != bb) { 91*b5b17502SBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 92*b5b17502SBarry Smith } else { 93*b5b17502SBarry Smith b = x; 94*b5b17502SBarry Smith } 95*b5b17502SBarry Smith 96*b5b17502SBarry Smith if (!a->idiagvalid) { 97*b5b17502SBarry Smith if (!a->idiag) { 98*b5b17502SBarry Smith ierr = PetscMalloc(m*sizeof(PetscScalar),&a->idiag);CHKERRQ(ierr); 99*b5b17502SBarry Smith } 100*b5b17502SBarry Smith for (i=0; i<a->mbs; i++) a->idiag[i] = 1.0/a->a[a->i[i]]; 101*b5b17502SBarry Smith a->idiagvalid = PETSC_TRUE; 102*b5b17502SBarry Smith } 103*b5b17502SBarry Smith 104*b5b17502SBarry Smith if (!a->relax_work) { 105*b5b17502SBarry Smith ierr = PetscMalloc(m*sizeof(PetscScalar),&a->relax_work);CHKERRQ(ierr); 106*b5b17502SBarry Smith } 107*b5b17502SBarry Smith t = a->relax_work; 108*b5b17502SBarry Smith 109*b5b17502SBarry Smith aidiag = a->idiag; 110*b5b17502SBarry Smith 111*b5b17502SBarry Smith if (flag == SOR_APPLY_UPPER) { 112*b5b17502SBarry Smith /* apply (U + D/omega) to the vector */ 113*b5b17502SBarry Smith PetscScalar d; 114*b5b17502SBarry Smith for (i=0; i<m; i++) { 115*b5b17502SBarry Smith d = fshift + aa[ai[i]]; 116*b5b17502SBarry Smith nz = ai[i+1] - ai[i] - 1; 117*b5b17502SBarry Smith vj = aj + ai[i] + 1; 118*b5b17502SBarry Smith v = aa + ai[i] + 1; 119*b5b17502SBarry Smith sum = b[i]*d/omega; 120*b5b17502SBarry Smith PetscSparseDensePlusDot(sum,b,v,vj,nz); 121*b5b17502SBarry Smith x[i] = sum; 122*b5b17502SBarry Smith } 123*b5b17502SBarry Smith ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 124*b5b17502SBarry Smith } 125*b5b17502SBarry Smith 126*b5b17502SBarry Smith if (flag & SOR_ZERO_INITIAL_GUESS) { 127*b5b17502SBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 128*b5b17502SBarry Smith ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 129*b5b17502SBarry Smith 130*b5b17502SBarry Smith v = aa + 1; 131*b5b17502SBarry Smith vj = aj + 1; 132*b5b17502SBarry Smith for (i=0; i<m; i++){ 133*b5b17502SBarry Smith nz = ai[i+1] - ai[i] - 1; 134*b5b17502SBarry Smith tmp = - (x[i] = omega*t[i]*aidiag[i]); 135*b5b17502SBarry Smith for (j=0; j<nz; j++) { 136*b5b17502SBarry Smith t[vj[j]] += tmp*v[j]; 137*b5b17502SBarry Smith } 138*b5b17502SBarry Smith v += nz + 1; 139*b5b17502SBarry Smith vj += nz + 1; 140*b5b17502SBarry Smith } 141*b5b17502SBarry Smith ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 142*b5b17502SBarry Smith } 143*b5b17502SBarry Smith 144*b5b17502SBarry Smith if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 145*b5b17502SBarry Smith int nz2; 146*b5b17502SBarry Smith if (!(flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP)){ 147*b5b17502SBarry Smith #define PETSC_USE_BACKWARD_LOOP 148*b5b17502SBarry Smith #if defined(PETSC_USE_BACKWARD_LOOP) 149*b5b17502SBarry Smith v = aa + ai[m] - 1; 150*b5b17502SBarry Smith vj = aj + ai[m] - 1; 151*b5b17502SBarry Smith for (i=m-1; i>=0; i--){ 152*b5b17502SBarry Smith sum = b[i]; 153*b5b17502SBarry Smith nz = ai[i+1] - ai[i] - 1; 154*b5b17502SBarry Smith {PetscInt __i;for(__i=0;__i<nz;__i++) sum -= v[-__i] * x[vj[-__i]];} 155*b5b17502SBarry Smith #else 156*b5b17502SBarry Smith v = aa + ai[m-1] + 1; 157*b5b17502SBarry Smith vj = aj + ai[m-1] + 1; 158*b5b17502SBarry Smith nz = 0; 159*b5b17502SBarry Smith for (i=m-1; i>=0; i--){ 160*b5b17502SBarry Smith sum = b[i]; 161*b5b17502SBarry Smith nz2 = ai[i] - ai[i-1] - 1; 162*b5b17502SBarry Smith PETSC_Prefetch(v-nz2-1,0,1); 163*b5b17502SBarry Smith PETSC_Prefetch(vj-nz2-1,0,1); 164*b5b17502SBarry Smith PetscSparseDenseMinusDot(sum,x,v,vj,nz); 165*b5b17502SBarry Smith nz = nz2; 166*b5b17502SBarry Smith #endif 167*b5b17502SBarry Smith x[i] = omega*sum*aidiag[i]; 168*b5b17502SBarry Smith v -= nz + 1; 169*b5b17502SBarry Smith vj -= nz + 1; 170*b5b17502SBarry Smith } 171*b5b17502SBarry Smith ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 172*b5b17502SBarry Smith } else { 173*b5b17502SBarry Smith v = aa + ai[m-1] + 1; 174*b5b17502SBarry Smith vj = aj + ai[m-1] + 1; 175*b5b17502SBarry Smith nz = 0; 176*b5b17502SBarry Smith for (i=m-1; i>=0; i--){ 177*b5b17502SBarry Smith sum = t[i]; 178*b5b17502SBarry Smith nz2 = ai[i] - ai[i-1] - 1; 179*b5b17502SBarry Smith PETSC_Prefetch(v-nz2-1,0,1); 180*b5b17502SBarry Smith PETSC_Prefetch(vj-nz2-1,0,1); 181*b5b17502SBarry Smith PetscSparseDenseMinusDot(sum,x,v,vj,nz); 182*b5b17502SBarry Smith x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 183*b5b17502SBarry Smith nz = nz2; 184*b5b17502SBarry Smith v -= nz + 1; 185*b5b17502SBarry Smith vj -= nz + 1; 186*b5b17502SBarry Smith } 187*b5b17502SBarry Smith ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 188*b5b17502SBarry Smith } 189*b5b17502SBarry Smith } 190*b5b17502SBarry Smith its--; 191*b5b17502SBarry Smith } 192*b5b17502SBarry Smith 193*b5b17502SBarry Smith while (its--) { 194*b5b17502SBarry Smith /* 195*b5b17502SBarry Smith forward sweep: 196*b5b17502SBarry Smith for i=0,...,m-1: 197*b5b17502SBarry Smith sum[i] = (b[i] - U(i,:)x )/d[i]; 198*b5b17502SBarry Smith x[i] = (1-omega)x[i] + omega*sum[i]; 199*b5b17502SBarry Smith b = b - x[i]*U^T(i,:); 200*b5b17502SBarry Smith 201*b5b17502SBarry Smith */ 202*b5b17502SBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 203*b5b17502SBarry Smith ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 204*b5b17502SBarry Smith 205*b5b17502SBarry Smith for (i=0; i<m; i++){ 206*b5b17502SBarry Smith v = aa + ai[i] + 1; v1=v; 207*b5b17502SBarry Smith vj = aj + ai[i] + 1; vj1=vj; 208*b5b17502SBarry Smith nz = ai[i+1] - ai[i] - 1; nz1=nz; 209*b5b17502SBarry Smith sum = t[i]; 210*b5b17502SBarry Smith ierr = PetscLogFlops(4.0*nz-2);CHKERRQ(ierr); 211*b5b17502SBarry Smith while (nz1--) sum -= (*v1++)*x[*vj1++]; 212*b5b17502SBarry Smith x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 213*b5b17502SBarry Smith while (nz--) t[*vj++] -= x[i]*(*v++); 214*b5b17502SBarry Smith } 215*b5b17502SBarry Smith } 216*b5b17502SBarry Smith 217*b5b17502SBarry Smith if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 218*b5b17502SBarry Smith /* 219*b5b17502SBarry Smith backward sweep: 220*b5b17502SBarry Smith b = b - x[i]*U^T(i,:), i=0,...,n-2 221*b5b17502SBarry Smith for i=m-1,...,0: 222*b5b17502SBarry Smith sum[i] = (b[i] - U(i,:)x )/d[i]; 223*b5b17502SBarry Smith x[i] = (1-omega)x[i] + omega*sum[i]; 224*b5b17502SBarry Smith */ 225*b5b17502SBarry Smith /* if there was a forward sweep done above then I thing the next two for loops are not needed */ 226*b5b17502SBarry Smith ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 227*b5b17502SBarry Smith 228*b5b17502SBarry Smith for (i=0; i<m-1; i++){ /* update rhs */ 229*b5b17502SBarry Smith v = aa + ai[i] + 1; 230*b5b17502SBarry Smith vj = aj + ai[i] + 1; 231*b5b17502SBarry Smith nz = ai[i+1] - ai[i] - 1; 232*b5b17502SBarry Smith ierr = PetscLogFlops(2.0*nz-1);CHKERRQ(ierr); 233*b5b17502SBarry Smith while (nz--) t[*vj++] -= x[i]*(*v++); 234*b5b17502SBarry Smith } 235*b5b17502SBarry Smith for (i=m-1; i>=0; i--){ 236*b5b17502SBarry Smith v = aa + ai[i] + 1; 237*b5b17502SBarry Smith vj = aj + ai[i] + 1; 238*b5b17502SBarry Smith nz = ai[i+1] - ai[i] - 1; 239*b5b17502SBarry Smith ierr = PetscLogFlops(2.0*nz-1);CHKERRQ(ierr); 240*b5b17502SBarry Smith sum = t[i]; 241*b5b17502SBarry Smith while (nz--) sum -= x[*vj++]*(*v++); 242*b5b17502SBarry Smith x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 243*b5b17502SBarry Smith } 244*b5b17502SBarry Smith } 245*b5b17502SBarry Smith } 246*b5b17502SBarry Smith 247*b5b17502SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 248*b5b17502SBarry Smith if (bb != xx) { 249*b5b17502SBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 250*b5b17502SBarry Smith } 251*b5b17502SBarry Smith PetscFunctionReturn(0); 252*b5b17502SBarry Smith } 253