1 2 /* 3 This is included by sbaij.c to generate unsigned short and regular versions of these two functions 4 */ 5 #undef __FUNCT__ 6 #define __FUNCT__ "MatMult_SeqSBAIJ_1_Hermitian" 7 #if defined(USESHORT) 8 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian_ushort(Mat A,Vec xx,Vec zz) 9 #else 10 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian(Mat A,Vec xx,Vec zz) 11 #endif 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 const PetscScalar *x; 15 PetscScalar *z,x1,sum; 16 const MatScalar *v; 17 MatScalar vj; 18 PetscErrorCode ierr; 19 PetscInt mbs=a->mbs,i,j,nz; 20 const PetscInt *ai=a->i; 21 #if defined(USESHORT) 22 const unsigned short *ib=a->jshort; 23 unsigned short ibt; 24 #else 25 const PetscInt *ib=a->j; 26 PetscInt ibt; 27 #endif 28 PetscInt nonzerorow = 0; 29 30 PetscFunctionBegin; 31 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 32 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 33 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 34 35 v = a->a; 36 for (i=0; i<mbs; i++) { 37 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 38 if (!nz) continue; /* Move to the next row if the current row is empty */ 39 nonzerorow++; 40 x1 = x[i]; 41 sum = v[0]*x1; /* diagonal term */ 42 for (j=1; j<nz; j++) { 43 ibt = ib[j]; 44 vj = v[j]; 45 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 46 z[ibt] += PetscConj(v[j]) * x1; /* (strict lower triangular part of A)*x */ 47 } 48 z[i] += sum; 49 v += nz; 50 ib += nz; 51 } 52 53 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 54 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 55 ierr = PetscLogFlops(2.0*(2.0*a->nz - nonzerorow) - nonzerorow);CHKERRQ(ierr); 56 PetscFunctionReturn(0); 57 } 58 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 61 #if defined(USESHORT) 62 PetscErrorCode MatMult_SeqSBAIJ_1_ushort(Mat A,Vec xx,Vec zz) 63 #else 64 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 65 #endif 66 { 67 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 68 const PetscScalar *x; 69 PetscScalar *z,x1,sum; 70 const MatScalar *v; 71 MatScalar vj; 72 PetscErrorCode ierr; 73 PetscInt mbs=a->mbs,i,j,nz; 74 const PetscInt *ai=a->i; 75 #if defined(USESHORT) 76 const unsigned short *ib=a->jshort; 77 unsigned short ibt; 78 #else 79 const PetscInt *ib=a->j; 80 PetscInt ibt; 81 #endif 82 PetscInt nonzerorow=0; 83 84 PetscFunctionBegin; 85 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 86 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 87 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 88 89 v = a->a; 90 for (i=0; i<mbs; i++) { 91 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 92 if (!nz) continue; /* Move to the next row if the current row is empty */ 93 nonzerorow++; 94 x1 = x[i]; 95 sum = v[0]*x1; /* diagonal term */ 96 for (j=1; j<nz; j++) { 97 ibt = ib[j]; 98 vj = v[j]; 99 z[ibt] += vj * x1; /* (strict lower triangular part of A)*x */ 100 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 101 } 102 z[i] += sum; 103 v += nz; 104 ib += nz; 105 } 106 107 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 108 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 109 ierr = PetscLogFlops(2.0*(2.0*a->nz - nonzerorow) - nonzerorow);CHKERRQ(ierr); 110 PetscFunctionReturn(0); 111 } 112 113 #undef __FUNCT__ 114 #define __FUNCT__ "MatSOR_SeqSBAIJ" 115 #if defined(USESHORT) 116 PetscErrorCode MatSOR_SeqSBAIJ_ushort(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 117 #else 118 PetscErrorCode MatSOR_SeqSBAIJ(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 119 #endif 120 { 121 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 122 const MatScalar *aa=a->a,*v,*v1,*aidiag; 123 PetscScalar *x,*t,sum; 124 const PetscScalar *b; 125 MatScalar tmp; 126 PetscErrorCode ierr; 127 PetscInt m=a->mbs,bs=A->rmap->bs,j; 128 const PetscInt *ai=a->i; 129 #if defined(USESHORT) 130 const unsigned short *aj=a->jshort,*vj,*vj1; 131 #else 132 const PetscInt *aj=a->j,*vj,*vj1; 133 #endif 134 PetscInt nz,nz1,i; 135 136 PetscFunctionBegin; 137 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support yet for Eisenstat"); 138 139 its = its*lits; 140 if (its <= 0) SETERRQ2(PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits); 141 142 if (bs > 1) SETERRQ(PETSC_ERR_SUP,"SSOR for block size > 1 is not yet implemented"); 143 144 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 145 if (xx != bb) { 146 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 147 } else { 148 b = x; 149 } 150 151 if (!a->idiagvalid) { 152 if (!a->idiag) { 153 ierr = PetscMalloc(m*sizeof(PetscScalar),&a->idiag);CHKERRQ(ierr); 154 } 155 for (i=0; i<a->mbs; i++) a->idiag[i] = 1.0/a->a[a->i[i]]; 156 a->idiagvalid = PETSC_TRUE; 157 } 158 159 if (!a->sor_work) { 160 ierr = PetscMalloc(m*sizeof(PetscScalar),&a->sor_work);CHKERRQ(ierr); 161 } 162 t = a->sor_work; 163 164 aidiag = a->idiag; 165 166 if (flag == SOR_APPLY_UPPER) { 167 /* apply (U + D/omega) to the vector */ 168 PetscScalar d; 169 for (i=0; i<m; i++) { 170 d = fshift + aa[ai[i]]; 171 nz = ai[i+1] - ai[i] - 1; 172 vj = aj + ai[i] + 1; 173 v = aa + ai[i] + 1; 174 sum = b[i]*d/omega; 175 PetscSparseDensePlusDot(sum,b,v,vj,nz); 176 x[i] = sum; 177 } 178 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 179 } 180 181 if (flag & SOR_ZERO_INITIAL_GUESS) { 182 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 183 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 184 185 v = aa + 1; 186 vj = aj + 1; 187 for (i=0; i<m; i++){ 188 nz = ai[i+1] - ai[i] - 1; 189 tmp = - (x[i] = omega*t[i]*aidiag[i]); 190 for (j=0; j<nz; j++) { 191 t[vj[j]] += tmp*v[j]; 192 } 193 v += nz + 1; 194 vj += nz + 1; 195 } 196 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 197 } 198 199 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 200 int nz2; 201 if (!(flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP)){ 202 #if defined(PETSC_USE_BACKWARD_LOOP) 203 v = aa + ai[m] - 1; 204 vj = aj + ai[m] - 1; 205 for (i=m-1; i>=0; i--){ 206 sum = b[i]; 207 nz = ai[i+1] - ai[i] - 1; 208 {PetscInt __i;for(__i=0;__i<nz;__i++) sum -= v[-__i] * x[vj[-__i]];} 209 #else 210 v = aa + ai[m-1] + 1; 211 vj = aj + ai[m-1] + 1; 212 nz = 0; 213 for (i=m-1; i>=0; i--){ 214 sum = b[i]; 215 nz2 = ai[i] - ai[i-1] - 1; 216 PETSC_Prefetch(v-nz2-1,0,1); 217 PETSC_Prefetch(vj-nz2-1,0,1); 218 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 219 nz = nz2; 220 #endif 221 x[i] = omega*sum*aidiag[i]; 222 v -= nz + 1; 223 vj -= nz + 1; 224 } 225 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 226 } else { 227 v = aa + ai[m-1] + 1; 228 vj = aj + ai[m-1] + 1; 229 nz = 0; 230 for (i=m-1; i>=0; i--){ 231 sum = t[i]; 232 nz2 = ai[i] - ai[i-1] - 1; 233 PETSC_Prefetch(v-nz2-1,0,1); 234 PETSC_Prefetch(vj-nz2-1,0,1); 235 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 236 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 237 nz = nz2; 238 v -= nz + 1; 239 vj -= nz + 1; 240 } 241 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 242 } 243 } 244 its--; 245 } 246 247 while (its--) { 248 /* 249 forward sweep: 250 for i=0,...,m-1: 251 sum[i] = (b[i] - U(i,:)x )/d[i]; 252 x[i] = (1-omega)x[i] + omega*sum[i]; 253 b = b - x[i]*U^T(i,:); 254 255 */ 256 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 257 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 258 259 for (i=0; i<m; i++){ 260 v = aa + ai[i] + 1; v1=v; 261 vj = aj + ai[i] + 1; vj1=vj; 262 nz = ai[i+1] - ai[i] - 1; nz1=nz; 263 sum = t[i]; 264 ierr = PetscLogFlops(4.0*nz-2);CHKERRQ(ierr); 265 while (nz1--) sum -= (*v1++)*x[*vj1++]; 266 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 267 while (nz--) t[*vj++] -= x[i]*(*v++); 268 } 269 } 270 271 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 272 /* 273 backward sweep: 274 b = b - x[i]*U^T(i,:), i=0,...,n-2 275 for i=m-1,...,0: 276 sum[i] = (b[i] - U(i,:)x )/d[i]; 277 x[i] = (1-omega)x[i] + omega*sum[i]; 278 */ 279 /* if there was a forward sweep done above then I thing the next two for loops are not needed */ 280 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 281 282 for (i=0; i<m-1; i++){ /* update rhs */ 283 v = aa + ai[i] + 1; 284 vj = aj + ai[i] + 1; 285 nz = ai[i+1] - ai[i] - 1; 286 ierr = PetscLogFlops(2.0*nz-1);CHKERRQ(ierr); 287 while (nz--) t[*vj++] -= x[i]*(*v++); 288 } 289 for (i=m-1; i>=0; i--){ 290 v = aa + ai[i] + 1; 291 vj = aj + ai[i] + 1; 292 nz = ai[i+1] - ai[i] - 1; 293 ierr = PetscLogFlops(2.0*nz-1);CHKERRQ(ierr); 294 sum = t[i]; 295 while (nz--) sum -= x[*vj++]*(*v++); 296 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 297 } 298 } 299 } 300 301 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 302 if (bb != xx) { 303 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 304 } 305 PetscFunctionReturn(0); 306 } 307