1 2 /* 3 This is included by sbaij.c to generate unsigned short and regular versions of these two functions 4 */ 5 #undef __FUNCT__ 6 #if defined(USESHORT) 7 #define __FUNCT__ "MatMult_SeqSBAIJ_1_Hermitian_ushort" 8 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian_ushort(Mat A,Vec xx,Vec zz) 9 #else 10 #define __FUNCT__ "MatMult_SeqSBAIJ_1_Hermitian" 11 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian(Mat A,Vec xx,Vec zz) 12 #endif 13 { 14 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 15 const PetscScalar *x; 16 PetscScalar *z,x1,sum; 17 const MatScalar *v; 18 MatScalar vj; 19 PetscErrorCode ierr; 20 PetscInt mbs=a->mbs,i,j,nz; 21 const PetscInt *ai=a->i; 22 #if defined(USESHORT) 23 const unsigned short *ib=a->jshort; 24 unsigned short ibt; 25 #else 26 const PetscInt *ib=a->j; 27 PetscInt ibt; 28 #endif 29 30 PetscFunctionBegin; 31 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 32 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 33 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 34 35 v = a->a; 36 for (i=0; i<mbs; i++) { 37 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 38 x1 = x[i]; 39 sum = v[0]*x1; /* diagonal term */ 40 for (j=1; j<nz; j++) { 41 ibt = ib[j]; 42 vj = v[j]; 43 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 44 z[ibt] += PetscConj(v[j]) * x1; /* (strict lower triangular part of A)*x */ 45 } 46 z[i] += sum; 47 v += nz; 48 ib += nz; 49 } 50 51 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 52 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 53 ierr = PetscLogFlops(2.0*(2.0*a->nz - mbs) - mbs);CHKERRQ(ierr); 54 PetscFunctionReturn(0); 55 } 56 57 #undef __FUNCT__ 58 #if defined(USESHORT) 59 #define __FUNCT__ "MatMult_SeqSBAIJ_1_ushort" 60 PetscErrorCode MatMult_SeqSBAIJ_1_ushort(Mat A,Vec xx,Vec zz) 61 #else 62 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 63 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 64 #endif 65 { 66 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 67 const PetscScalar *x; 68 PetscScalar *z,x1,sum; 69 const MatScalar *v; 70 MatScalar vj; 71 PetscErrorCode ierr; 72 PetscInt mbs=a->mbs,i,j,nz; 73 const PetscInt *ai=a->i; 74 #if defined(USESHORT) 75 const unsigned short *ib=a->jshort; 76 unsigned short ibt; 77 #else 78 const PetscInt *ib=a->j; 79 PetscInt ibt; 80 #endif 81 82 PetscFunctionBegin; 83 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 84 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 85 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 86 87 v = a->a; 88 for (i=0; i<mbs; i++) { 89 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 90 x1 = x[i]; 91 sum = v[0]*x1; /* diagonal term */ 92 for (j=1; j<nz; j++) { 93 ibt = ib[j]; 94 vj = v[j]; 95 z[ibt] += vj * x1; /* (strict lower triangular part of A)*x */ 96 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 97 } 98 z[i] += sum; 99 v += nz; 100 ib += nz; 101 } 102 103 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 104 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 105 ierr = PetscLogFlops(2.0*(2.0*a->nz - mbs) - mbs);CHKERRQ(ierr); 106 PetscFunctionReturn(0); 107 } 108 109 #undef __FUNCT__ 110 #if defined(USESHORT) 111 #define __FUNCT__ "MatSOR_SeqSBAIJ_ushort" 112 PetscErrorCode MatSOR_SeqSBAIJ_ushort(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 113 #else 114 #define __FUNCT__ "MatSOR_SeqSBAIJ" 115 PetscErrorCode MatSOR_SeqSBAIJ(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 116 #endif 117 { 118 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 119 const MatScalar *aa=a->a,*v,*v1,*aidiag; 120 PetscScalar *x,*t,sum; 121 const PetscScalar *b; 122 MatScalar tmp; 123 PetscErrorCode ierr; 124 PetscInt m=a->mbs,bs=A->rmap->bs,j; 125 const PetscInt *ai=a->i; 126 #if defined(USESHORT) 127 const unsigned short *aj=a->jshort,*vj,*vj1; 128 #else 129 const PetscInt *aj=a->j,*vj,*vj1; 130 #endif 131 PetscInt nz,nz1,i; 132 133 PetscFunctionBegin; 134 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 135 136 its = its*lits; 137 if (its <= 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits); 138 139 if (bs > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SSOR for block size > 1 is not yet implemented"); 140 141 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 142 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 143 144 if (!a->idiagvalid) { 145 if (!a->idiag) { 146 ierr = PetscMalloc(m*sizeof(PetscScalar),&a->idiag);CHKERRQ(ierr); 147 } 148 for (i=0; i<a->mbs; i++) a->idiag[i] = 1.0/a->a[a->i[i]]; 149 a->idiagvalid = PETSC_TRUE; 150 } 151 152 if (!a->sor_work) { 153 ierr = PetscMalloc(m*sizeof(PetscScalar),&a->sor_work);CHKERRQ(ierr); 154 } 155 t = a->sor_work; 156 157 aidiag = a->idiag; 158 159 if (flag == SOR_APPLY_UPPER) { 160 /* apply (U + D/omega) to the vector */ 161 PetscScalar d; 162 for (i=0; i<m; i++) { 163 d = fshift + aa[ai[i]]; 164 nz = ai[i+1] - ai[i] - 1; 165 vj = aj + ai[i] + 1; 166 v = aa + ai[i] + 1; 167 sum = b[i]*d/omega; 168 PetscSparseDensePlusDot(sum,b,v,vj,nz); 169 x[i] = sum; 170 } 171 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 172 } 173 174 if (flag & SOR_ZERO_INITIAL_GUESS) { 175 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 176 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 177 178 v = aa + 1; 179 vj = aj + 1; 180 for (i=0; i<m; i++){ 181 nz = ai[i+1] - ai[i] - 1; 182 tmp = - (x[i] = omega*t[i]*aidiag[i]); 183 for (j=0; j<nz; j++) { 184 t[vj[j]] += tmp*v[j]; 185 } 186 v += nz + 1; 187 vj += nz + 1; 188 } 189 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 190 } 191 192 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 193 int nz2; 194 if (!(flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP)){ 195 #if defined(PETSC_USE_BACKWARD_LOOP) 196 v = aa + ai[m] - 1; 197 vj = aj + ai[m] - 1; 198 for (i=m-1; i>=0; i--){ 199 sum = b[i]; 200 nz = ai[i+1] - ai[i] - 1; 201 {PetscInt __i;for(__i=0;__i<nz;__i++) sum -= v[-__i] * x[vj[-__i]];} 202 #else 203 v = aa + ai[m-1] + 1; 204 vj = aj + ai[m-1] + 1; 205 nz = 0; 206 for (i=m-1; i>=0; i--){ 207 sum = b[i]; 208 nz2 = ai[i] - ai[i-1] - 1; 209 PETSC_Prefetch(v-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 210 PETSC_Prefetch(vj-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 211 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 212 nz = nz2; 213 #endif 214 x[i] = omega*sum*aidiag[i]; 215 v -= nz + 1; 216 vj -= nz + 1; 217 } 218 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 219 } else { 220 v = aa + ai[m-1] + 1; 221 vj = aj + ai[m-1] + 1; 222 nz = 0; 223 for (i=m-1; i>=0; i--){ 224 sum = t[i]; 225 nz2 = ai[i] - ai[i-1] - 1; 226 PETSC_Prefetch(v-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 227 PETSC_Prefetch(vj-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 228 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 229 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 230 nz = nz2; 231 v -= nz + 1; 232 vj -= nz + 1; 233 } 234 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 235 } 236 } 237 its--; 238 } 239 240 while (its--) { 241 /* 242 forward sweep: 243 for i=0,...,m-1: 244 sum[i] = (b[i] - U(i,:)x )/d[i]; 245 x[i] = (1-omega)x[i] + omega*sum[i]; 246 b = b - x[i]*U^T(i,:); 247 248 */ 249 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 250 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 251 252 for (i=0; i<m; i++){ 253 v = aa + ai[i] + 1; v1=v; 254 vj = aj + ai[i] + 1; vj1=vj; 255 nz = ai[i+1] - ai[i] - 1; nz1=nz; 256 sum = t[i]; 257 ierr = PetscLogFlops(4.0*nz-2);CHKERRQ(ierr); 258 while (nz1--) sum -= (*v1++)*x[*vj1++]; 259 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 260 while (nz--) t[*vj++] -= x[i]*(*v++); 261 } 262 } 263 264 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 265 /* 266 backward sweep: 267 b = b - x[i]*U^T(i,:), i=0,...,n-2 268 for i=m-1,...,0: 269 sum[i] = (b[i] - U(i,:)x )/d[i]; 270 x[i] = (1-omega)x[i] + omega*sum[i]; 271 */ 272 /* if there was a forward sweep done above then I thing the next two for loops are not needed */ 273 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 274 275 for (i=0; i<m-1; i++){ /* update rhs */ 276 v = aa + ai[i] + 1; 277 vj = aj + ai[i] + 1; 278 nz = ai[i+1] - ai[i] - 1; 279 ierr = PetscLogFlops(2.0*nz-1);CHKERRQ(ierr); 280 while (nz--) t[*vj++] -= x[i]*(*v++); 281 } 282 for (i=m-1; i>=0; i--){ 283 v = aa + ai[i] + 1; 284 vj = aj + ai[i] + 1; 285 nz = ai[i+1] - ai[i] - 1; 286 ierr = PetscLogFlops(2.0*nz-1);CHKERRQ(ierr); 287 sum = t[i]; 288 while (nz--) sum -= x[*vj++]*(*v++); 289 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 290 } 291 } 292 } 293 294 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 295 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 296 PetscFunctionReturn(0); 297 } 298