1 2 /* 3 This is included by sbaij.c to generate unsigned short and regular versions of these two functions 4 */ 5 #undef __FUNCT__ 6 #if defined(USESHORT) 7 #define __FUNCT__ "MatMult_SeqSBAIJ_1_Hermitian_ushort" 8 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian_ushort(Mat A,Vec xx,Vec zz) 9 #else 10 #define __FUNCT__ "MatMult_SeqSBAIJ_1_Hermitian" 11 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian(Mat A,Vec xx,Vec zz) 12 #endif 13 { 14 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 15 const PetscScalar *x; 16 PetscScalar *z,x1,sum; 17 const MatScalar *v; 18 MatScalar vj; 19 PetscErrorCode ierr; 20 PetscInt mbs=a->mbs,i,j,nz; 21 const PetscInt *ai=a->i; 22 #if defined(USESHORT) 23 const unsigned short *ib=a->jshort; 24 unsigned short ibt; 25 #else 26 const PetscInt *ib=a->j; 27 PetscInt ibt; 28 #endif 29 PetscInt nonzerorow = 0; 30 31 PetscFunctionBegin; 32 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 33 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 34 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 35 36 v = a->a; 37 for (i=0; i<mbs; i++) { 38 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 39 if (!nz) continue; /* Move to the next row if the current row is empty */ 40 nonzerorow++; 41 x1 = x[i]; 42 sum = v[0]*x1; /* diagonal term */ 43 for (j=1; j<nz; j++) { 44 ibt = ib[j]; 45 vj = v[j]; 46 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 47 z[ibt] += PetscConj(v[j]) * x1; /* (strict lower triangular part of A)*x */ 48 } 49 z[i] += sum; 50 v += nz; 51 ib += nz; 52 } 53 54 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 55 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 56 ierr = PetscLogFlops(2.0*(2.0*a->nz - nonzerorow) - nonzerorow);CHKERRQ(ierr); 57 PetscFunctionReturn(0); 58 } 59 60 #undef __FUNCT__ 61 #if defined(USESHORT) 62 #define __FUNCT__ "MatMult_SeqSBAIJ_1_ushort" 63 PetscErrorCode MatMult_SeqSBAIJ_1_ushort(Mat A,Vec xx,Vec zz) 64 #else 65 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 66 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 67 #endif 68 { 69 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 70 const PetscScalar *x; 71 PetscScalar *z,x1,sum; 72 const MatScalar *v; 73 MatScalar vj; 74 PetscErrorCode ierr; 75 PetscInt mbs=a->mbs,i,j,nz; 76 const PetscInt *ai=a->i; 77 #if defined(USESHORT) 78 const unsigned short *ib=a->jshort; 79 unsigned short ibt; 80 #else 81 const PetscInt *ib=a->j; 82 PetscInt ibt; 83 #endif 84 PetscInt nonzerorow=0; 85 86 PetscFunctionBegin; 87 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 88 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 89 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 90 91 v = a->a; 92 for (i=0; i<mbs; i++) { 93 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 94 if (!nz) continue; /* Move to the next row if the current row is empty */ 95 nonzerorow++; 96 x1 = x[i]; 97 sum = v[0]*x1; /* diagonal term */ 98 for (j=1; j<nz; j++) { 99 ibt = ib[j]; 100 vj = v[j]; 101 z[ibt] += vj * x1; /* (strict lower triangular part of A)*x */ 102 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 103 } 104 z[i] += sum; 105 v += nz; 106 ib += nz; 107 } 108 109 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 110 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 111 ierr = PetscLogFlops(2.0*(2.0*a->nz - nonzerorow) - nonzerorow);CHKERRQ(ierr); 112 PetscFunctionReturn(0); 113 } 114 115 #undef __FUNCT__ 116 #if defined(USESHORT) 117 #define __FUNCT__ "MatSOR_SeqSBAIJ_ushort" 118 PetscErrorCode MatSOR_SeqSBAIJ_ushort(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 119 #else 120 #define __FUNCT__ "MatSOR_SeqSBAIJ" 121 PetscErrorCode MatSOR_SeqSBAIJ(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 122 #endif 123 { 124 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 125 const MatScalar *aa=a->a,*v,*v1,*aidiag; 126 PetscScalar *x,*t,sum; 127 const PetscScalar *b; 128 MatScalar tmp; 129 PetscErrorCode ierr; 130 PetscInt m=a->mbs,bs=A->rmap->bs,j; 131 const PetscInt *ai=a->i; 132 #if defined(USESHORT) 133 const unsigned short *aj=a->jshort,*vj,*vj1; 134 #else 135 const PetscInt *aj=a->j,*vj,*vj1; 136 #endif 137 PetscInt nz,nz1,i; 138 139 PetscFunctionBegin; 140 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 141 142 its = its*lits; 143 if (its <= 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits); 144 145 if (bs > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SSOR for block size > 1 is not yet implemented"); 146 147 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 148 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 149 150 if (!a->idiagvalid) { 151 if (!a->idiag) { 152 ierr = PetscMalloc(m*sizeof(PetscScalar),&a->idiag);CHKERRQ(ierr); 153 } 154 for (i=0; i<a->mbs; i++) a->idiag[i] = 1.0/a->a[a->i[i]]; 155 a->idiagvalid = PETSC_TRUE; 156 } 157 158 if (!a->sor_work) { 159 ierr = PetscMalloc(m*sizeof(PetscScalar),&a->sor_work);CHKERRQ(ierr); 160 } 161 t = a->sor_work; 162 163 aidiag = a->idiag; 164 165 if (flag == SOR_APPLY_UPPER) { 166 /* apply (U + D/omega) to the vector */ 167 PetscScalar d; 168 for (i=0; i<m; i++) { 169 d = fshift + aa[ai[i]]; 170 nz = ai[i+1] - ai[i] - 1; 171 vj = aj + ai[i] + 1; 172 v = aa + ai[i] + 1; 173 sum = b[i]*d/omega; 174 PetscSparseDensePlusDot(sum,b,v,vj,nz); 175 x[i] = sum; 176 } 177 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 178 } 179 180 if (flag & SOR_ZERO_INITIAL_GUESS) { 181 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 182 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 183 184 v = aa + 1; 185 vj = aj + 1; 186 for (i=0; i<m; i++){ 187 nz = ai[i+1] - ai[i] - 1; 188 tmp = - (x[i] = omega*t[i]*aidiag[i]); 189 for (j=0; j<nz; j++) { 190 t[vj[j]] += tmp*v[j]; 191 } 192 v += nz + 1; 193 vj += nz + 1; 194 } 195 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 196 } 197 198 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 199 int nz2; 200 if (!(flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP)){ 201 #if defined(PETSC_USE_BACKWARD_LOOP) 202 v = aa + ai[m] - 1; 203 vj = aj + ai[m] - 1; 204 for (i=m-1; i>=0; i--){ 205 sum = b[i]; 206 nz = ai[i+1] - ai[i] - 1; 207 {PetscInt __i;for(__i=0;__i<nz;__i++) sum -= v[-__i] * x[vj[-__i]];} 208 #else 209 v = aa + ai[m-1] + 1; 210 vj = aj + ai[m-1] + 1; 211 nz = 0; 212 for (i=m-1; i>=0; i--){ 213 sum = b[i]; 214 nz2 = ai[i] - ai[i-1] - 1; 215 PETSC_Prefetch(v-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 216 PETSC_Prefetch(vj-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 217 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 218 nz = nz2; 219 #endif 220 x[i] = omega*sum*aidiag[i]; 221 v -= nz + 1; 222 vj -= nz + 1; 223 } 224 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 225 } else { 226 v = aa + ai[m-1] + 1; 227 vj = aj + ai[m-1] + 1; 228 nz = 0; 229 for (i=m-1; i>=0; i--){ 230 sum = t[i]; 231 nz2 = ai[i] - ai[i-1] - 1; 232 PETSC_Prefetch(v-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 233 PETSC_Prefetch(vj-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 234 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 235 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 236 nz = nz2; 237 v -= nz + 1; 238 vj -= nz + 1; 239 } 240 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 241 } 242 } 243 its--; 244 } 245 246 while (its--) { 247 /* 248 forward sweep: 249 for i=0,...,m-1: 250 sum[i] = (b[i] - U(i,:)x )/d[i]; 251 x[i] = (1-omega)x[i] + omega*sum[i]; 252 b = b - x[i]*U^T(i,:); 253 254 */ 255 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 256 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 257 258 for (i=0; i<m; i++){ 259 v = aa + ai[i] + 1; v1=v; 260 vj = aj + ai[i] + 1; vj1=vj; 261 nz = ai[i+1] - ai[i] - 1; nz1=nz; 262 sum = t[i]; 263 ierr = PetscLogFlops(4.0*nz-2);CHKERRQ(ierr); 264 while (nz1--) sum -= (*v1++)*x[*vj1++]; 265 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 266 while (nz--) t[*vj++] -= x[i]*(*v++); 267 } 268 } 269 270 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 271 /* 272 backward sweep: 273 b = b - x[i]*U^T(i,:), i=0,...,n-2 274 for i=m-1,...,0: 275 sum[i] = (b[i] - U(i,:)x )/d[i]; 276 x[i] = (1-omega)x[i] + omega*sum[i]; 277 */ 278 /* if there was a forward sweep done above then I thing the next two for loops are not needed */ 279 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 280 281 for (i=0; i<m-1; i++){ /* update rhs */ 282 v = aa + ai[i] + 1; 283 vj = aj + ai[i] + 1; 284 nz = ai[i+1] - ai[i] - 1; 285 ierr = PetscLogFlops(2.0*nz-1);CHKERRQ(ierr); 286 while (nz--) t[*vj++] -= x[i]*(*v++); 287 } 288 for (i=m-1; i>=0; i--){ 289 v = aa + ai[i] + 1; 290 vj = aj + ai[i] + 1; 291 nz = ai[i+1] - ai[i] - 1; 292 ierr = PetscLogFlops(2.0*nz-1);CHKERRQ(ierr); 293 sum = t[i]; 294 while (nz--) sum -= x[*vj++]*(*v++); 295 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 296 } 297 } 298 } 299 300 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 301 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 302 PetscFunctionReturn(0); 303 } 304