1b5b17502SBarry Smith 2b5b17502SBarry Smith /* 3b5b17502SBarry Smith This is included by sbaij.c to generate unsigned short and regular versions of these two functions 4b5b17502SBarry Smith */ 554e8760dSRichard Tran Mills 654e8760dSRichard Tran Mills /* We cut-and-past below from aij.h to make a "no_function" version of PetscSparseDensePlusDot(). 754e8760dSRichard Tran Mills * This is necessary because the USESHORT case cannot use the inlined functions that may be employed. */ 854e8760dSRichard Tran Mills 954e8760dSRichard Tran Mills #if defined(PETSC_KERNEL_USE_UNROLL_4) 109371c9d4SSatish Balay #define PetscSparseDensePlusDot_no_function(sum, r, xv, xi, nnz) \ 11*a8f51744SPierre Jolivet do { \ 1254e8760dSRichard Tran Mills if (nnz > 0) { \ 13a9ce9505SJunchao Zhang PetscInt nnz2 = nnz, rem = nnz & 0x3; \ 14a9ce9505SJunchao Zhang switch (rem) { \ 15d71ae5a4SJacob Faibussowitsch case 3: \ 16d71ae5a4SJacob Faibussowitsch sum += *xv++ * r[*xi++]; \ 17d71ae5a4SJacob Faibussowitsch case 2: \ 18d71ae5a4SJacob Faibussowitsch sum += *xv++ * r[*xi++]; \ 19d71ae5a4SJacob Faibussowitsch case 1: \ 20d71ae5a4SJacob Faibussowitsch sum += *xv++ * r[*xi++]; \ 21d71ae5a4SJacob Faibussowitsch nnz2 -= rem; \ 22a9ce9505SJunchao Zhang } \ 239371c9d4SSatish Balay while (nnz2 > 0) { \ 249371c9d4SSatish Balay sum += xv[0] * r[xi[0]] + xv[1] * r[xi[1]] + xv[2] * r[xi[2]] + xv[3] * r[xi[3]]; \ 259371c9d4SSatish Balay xv += 4; \ 269371c9d4SSatish Balay xi += 4; \ 279371c9d4SSatish Balay nnz2 -= 4; \ 289371c9d4SSatish Balay } \ 299371c9d4SSatish Balay xv -= nnz; \ 309371c9d4SSatish Balay xi -= nnz; \ 31a9ce9505SJunchao Zhang } \ 32*a8f51744SPierre Jolivet } while (0) 3354e8760dSRichard Tran Mills 3454e8760dSRichard Tran Mills #elif defined(PETSC_KERNEL_USE_UNROLL_2) 359371c9d4SSatish Balay #define PetscSparseDensePlusDot_no_function(sum, r, xv, xi, nnz) \ 36*a8f51744SPierre Jolivet do { \ 3754e8760dSRichard Tran Mills PetscInt __i, __i1, __i2; \ 389371c9d4SSatish Balay for (__i = 0; __i < nnz - 1; __i += 2) { \ 399371c9d4SSatish Balay __i1 = xi[__i]; \ 409371c9d4SSatish Balay __i2 = xi[__i + 1]; \ 419371c9d4SSatish Balay sum += (xv[__i] * r[__i1] + xv[__i + 1] * r[__i2]); \ 429371c9d4SSatish Balay } \ 439371c9d4SSatish Balay if (nnz & 0x1) sum += xv[__i] * r[xi[__i]]; \ 44*a8f51744SPierre Jolivet } while (0) 4554e8760dSRichard Tran Mills 4654e8760dSRichard Tran Mills #else 479371c9d4SSatish Balay #define PetscSparseDensePlusDot_no_function(sum, r, xv, xi, nnz) \ 48*a8f51744SPierre Jolivet do { \ 4954e8760dSRichard Tran Mills PetscInt __i; \ 509371c9d4SSatish Balay for (__i = 0; __i < nnz; __i++) sum += xv[__i] * r[xi[__i]]; \ 51*a8f51744SPierre Jolivet } while (0) 5254e8760dSRichard Tran Mills #endif 5354e8760dSRichard Tran Mills 54018dd85eSSatish Balay #if defined(USESHORT) 55018dd85eSSatish Balay PetscErrorCode MatMult_SeqSBAIJ_1_ushort(Mat A, Vec xx, Vec zz) 56018dd85eSSatish Balay #else 57018dd85eSSatish Balay PetscErrorCode MatMult_SeqSBAIJ_1(Mat A, Vec xx, Vec zz) 58018dd85eSSatish Balay #endif 59b5b17502SBarry Smith { 60b5b17502SBarry Smith Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 61b5b17502SBarry Smith const PetscScalar *x; 62b5b17502SBarry Smith PetscScalar *z, x1, sum; 63b5b17502SBarry Smith const MatScalar *v; 64b5b17502SBarry Smith MatScalar vj; 65b5b17502SBarry Smith PetscInt mbs = a->mbs, i, j, nz; 66b5b17502SBarry Smith const PetscInt *ai = a->i; 67b5b17502SBarry Smith #if defined(USESHORT) 68b5b17502SBarry Smith const unsigned short *ib = a->jshort; 69b5b17502SBarry Smith unsigned short ibt; 70b5b17502SBarry Smith #else 71b5b17502SBarry Smith const PetscInt *ib = a->j; 72b5b17502SBarry Smith PetscInt ibt; 73b5b17502SBarry Smith #endif 748dca527aSShri Abhyankar PetscInt nonzerorow = 0, jmin; 75eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX) 76b94d7dedSBarry Smith const int aconj = A->hermitian == PETSC_BOOL3_TRUE; 77eb1ec7c1SStefano Zampini #else 78eb1ec7c1SStefano Zampini const int aconj = 0; 79eb1ec7c1SStefano Zampini #endif 80b5b17502SBarry Smith 81b5b17502SBarry Smith PetscFunctionBegin; 829566063dSJacob Faibussowitsch PetscCall(VecSet(zz, 0.0)); 839566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 849566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z)); 85b5b17502SBarry Smith 86b5b17502SBarry Smith v = a->a; 87b5b17502SBarry Smith for (i = 0; i < mbs; i++) { 88b5b17502SBarry Smith nz = ai[i + 1] - ai[i]; /* length of i_th row of A */ 8903d09022SShri Abhyankar if (!nz) continue; /* Move to the next row if the current row is empty */ 9003d09022SShri Abhyankar nonzerorow++; 918dca527aSShri Abhyankar sum = 0.0; 928dca527aSShri Abhyankar jmin = 0; 93b5b17502SBarry Smith x1 = x[i]; 948dca527aSShri Abhyankar if (ib[0] == i) { 95b5b17502SBarry Smith sum = v[0] * x1; /* diagonal term */ 968dca527aSShri Abhyankar jmin++; 978dca527aSShri Abhyankar } 98444d8c10SJed Brown PetscPrefetchBlock(ib + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 99444d8c10SJed Brown PetscPrefetchBlock(v + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 100eb1ec7c1SStefano Zampini if (aconj) { 101eb1ec7c1SStefano Zampini for (j = jmin; j < nz; j++) { 102eb1ec7c1SStefano Zampini ibt = ib[j]; 103eb1ec7c1SStefano Zampini vj = v[j]; 104eb1ec7c1SStefano Zampini z[ibt] += PetscConj(vj) * x1; /* (strict lower triangular part of A)*x */ 105eb1ec7c1SStefano Zampini sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 106eb1ec7c1SStefano Zampini } 107eb1ec7c1SStefano Zampini } else { 1088dca527aSShri Abhyankar for (j = jmin; j < nz; j++) { 109b5b17502SBarry Smith ibt = ib[j]; 110b5b17502SBarry Smith vj = v[j]; 111b5b17502SBarry Smith z[ibt] += vj * x1; /* (strict lower triangular part of A)*x */ 112b5b17502SBarry Smith sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 113b5b17502SBarry Smith } 114eb1ec7c1SStefano Zampini } 115b5b17502SBarry Smith z[i] += sum; 116b5b17502SBarry Smith v += nz; 117b5b17502SBarry Smith ib += nz; 118b5b17502SBarry Smith } 119b5b17502SBarry Smith 1209566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 1219566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z)); 1229566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (2.0 * a->nz - nonzerorow) - nonzerorow)); 1233ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 124b5b17502SBarry Smith } 125b5b17502SBarry Smith 126018dd85eSSatish Balay #if defined(USESHORT) 127018dd85eSSatish Balay PetscErrorCode MatSOR_SeqSBAIJ_ushort(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 128018dd85eSSatish Balay #else 129018dd85eSSatish Balay PetscErrorCode MatSOR_SeqSBAIJ(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 130018dd85eSSatish Balay #endif 131b5b17502SBarry Smith { 132b5b17502SBarry Smith Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 133b5b17502SBarry Smith const MatScalar *aa = a->a, *v, *v1, *aidiag; 134b5b17502SBarry Smith PetscScalar *x, *t, sum; 135b5b17502SBarry Smith const PetscScalar *b; 136b5b17502SBarry Smith MatScalar tmp; 137b5b17502SBarry Smith PetscInt m = a->mbs, bs = A->rmap->bs, j; 138b5b17502SBarry Smith const PetscInt *ai = a->i; 139b5b17502SBarry Smith #if defined(USESHORT) 140b5b17502SBarry Smith const unsigned short *aj = a->jshort, *vj, *vj1; 141b5b17502SBarry Smith #else 142b5b17502SBarry Smith const PetscInt *aj = a->j, *vj, *vj1; 143b5b17502SBarry Smith #endif 144b5b17502SBarry Smith PetscInt nz, nz1, i; 145b5b17502SBarry Smith 146b5b17502SBarry Smith PetscFunctionBegin; 147422a814eSBarry Smith if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */ 148aed4548fSBarry Smith PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 149b5b17502SBarry Smith 150b5b17502SBarry Smith its = its * lits; 15108401ef6SPierre Jolivet PetscCheck(its > 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Relaxation requires global its %" PetscInt_FMT " and local its %" PetscInt_FMT " both positive", its, lits); 152b5b17502SBarry Smith 15308401ef6SPierre Jolivet PetscCheck(bs <= 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "SSOR for block size > 1 is not yet implemented"); 154b5b17502SBarry Smith 1559566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 1569566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 157b5b17502SBarry Smith 158b5b17502SBarry Smith if (!a->idiagvalid) { 15948a46eb9SPierre Jolivet if (!a->idiag) PetscCall(PetscMalloc1(m, &a->idiag)); 160b5b17502SBarry Smith for (i = 0; i < a->mbs; i++) a->idiag[i] = 1.0 / a->a[a->i[i]]; 161b5b17502SBarry Smith a->idiagvalid = PETSC_TRUE; 162b5b17502SBarry Smith } 163b5b17502SBarry Smith 16448a46eb9SPierre Jolivet if (!a->sor_work) PetscCall(PetscMalloc1(m, &a->sor_work)); 16541f059aeSBarry Smith t = a->sor_work; 166b5b17502SBarry Smith 167b5b17502SBarry Smith aidiag = a->idiag; 168b5b17502SBarry Smith 169b5b17502SBarry Smith if (flag == SOR_APPLY_UPPER) { 170b5b17502SBarry Smith /* apply (U + D/omega) to the vector */ 171b5b17502SBarry Smith PetscScalar d; 172b5b17502SBarry Smith for (i = 0; i < m; i++) { 173b5b17502SBarry Smith d = fshift + aa[ai[i]]; 174b5b17502SBarry Smith nz = ai[i + 1] - ai[i] - 1; 175b5b17502SBarry Smith vj = aj + ai[i] + 1; 176b5b17502SBarry Smith v = aa + ai[i] + 1; 177b5b17502SBarry Smith sum = b[i] * d / omega; 17854e8760dSRichard Tran Mills #ifdef USESHORT 17954e8760dSRichard Tran Mills PetscSparseDensePlusDot_no_function(sum, b, v, vj, nz); 18054e8760dSRichard Tran Mills #else 181b5b17502SBarry Smith PetscSparseDensePlusDot(sum, b, v, vj, nz); 18254e8760dSRichard Tran Mills #endif 183b5b17502SBarry Smith x[i] = sum; 184b5b17502SBarry Smith } 1859566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 186b5b17502SBarry Smith } 187b5b17502SBarry Smith 188b5b17502SBarry Smith if (flag & SOR_ZERO_INITIAL_GUESS) { 189b5b17502SBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 1909566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(t, b, m)); 191b5b17502SBarry Smith 192b5b17502SBarry Smith v = aa + 1; 193b5b17502SBarry Smith vj = aj + 1; 194b5b17502SBarry Smith for (i = 0; i < m; i++) { 195b5b17502SBarry Smith nz = ai[i + 1] - ai[i] - 1; 196b5b17502SBarry Smith tmp = -(x[i] = omega * t[i] * aidiag[i]); 19726fbe8dcSKarl Rupp for (j = 0; j < nz; j++) t[vj[j]] += tmp * v[j]; 198b5b17502SBarry Smith v += nz + 1; 199b5b17502SBarry Smith vj += nz + 1; 200b5b17502SBarry Smith } 2019566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 202b5b17502SBarry Smith } 203b5b17502SBarry Smith 204b5b17502SBarry Smith if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 205b5b17502SBarry Smith int nz2; 206b5b17502SBarry Smith if (!(flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP)) { 207b5b17502SBarry Smith #if defined(PETSC_USE_BACKWARD_LOOP) 208b5b17502SBarry Smith v = aa + ai[m] - 1; 209b5b17502SBarry Smith vj = aj + ai[m] - 1; 210b5b17502SBarry Smith for (i = m - 1; i >= 0; i--) { 211b5b17502SBarry Smith sum = b[i]; 212b5b17502SBarry Smith nz = ai[i + 1] - ai[i] - 1; 2139371c9d4SSatish Balay { 2149371c9d4SSatish Balay PetscInt __i; 2159371c9d4SSatish Balay for (__i = 0; __i < nz; __i++) sum -= v[-__i] * x[vj[-__i]]; 2169371c9d4SSatish Balay } 217b5b17502SBarry Smith #else 218b5b17502SBarry Smith v = aa + ai[m - 1] + 1; 219b5b17502SBarry Smith vj = aj + ai[m - 1] + 1; 220b5b17502SBarry Smith nz = 0; 221b5b17502SBarry Smith for (i = m - 1; i >= 0; i--) { 222b5b17502SBarry Smith sum = b[i]; 2236c11f900SJed Brown nz2 = ai[i] - ai[PetscMax(i - 1, 0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */ 22450d8bf02SJed Brown PETSC_Prefetch(v - nz2 - 1, 0, PETSC_PREFETCH_HINT_NTA); 22550d8bf02SJed Brown PETSC_Prefetch(vj - nz2 - 1, 0, PETSC_PREFETCH_HINT_NTA); 226b5b17502SBarry Smith PetscSparseDenseMinusDot(sum, x, v, vj, nz); 227b5b17502SBarry Smith nz = nz2; 228b5b17502SBarry Smith #endif 229b5b17502SBarry Smith x[i] = omega * sum * aidiag[i]; 230b5b17502SBarry Smith v -= nz + 1; 231b5b17502SBarry Smith vj -= nz + 1; 232b5b17502SBarry Smith } 2339566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 234b5b17502SBarry Smith } else { 235b5b17502SBarry Smith v = aa + ai[m - 1] + 1; 236b5b17502SBarry Smith vj = aj + ai[m - 1] + 1; 237b5b17502SBarry Smith nz = 0; 238b5b17502SBarry Smith for (i = m - 1; i >= 0; i--) { 239b5b17502SBarry Smith sum = t[i]; 2406c11f900SJed Brown nz2 = ai[i] - ai[PetscMax(i - 1, 0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */ 24150d8bf02SJed Brown PETSC_Prefetch(v - nz2 - 1, 0, PETSC_PREFETCH_HINT_NTA); 24250d8bf02SJed Brown PETSC_Prefetch(vj - nz2 - 1, 0, PETSC_PREFETCH_HINT_NTA); 243b5b17502SBarry Smith PetscSparseDenseMinusDot(sum, x, v, vj, nz); 244b5b17502SBarry Smith x[i] = (1 - omega) * x[i] + omega * sum * aidiag[i]; 245b5b17502SBarry Smith nz = nz2; 246b5b17502SBarry Smith v -= nz + 1; 247b5b17502SBarry Smith vj -= nz + 1; 248b5b17502SBarry Smith } 2499566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 250b5b17502SBarry Smith } 251b5b17502SBarry Smith } 252b5b17502SBarry Smith its--; 253b5b17502SBarry Smith } 254b5b17502SBarry Smith 255b5b17502SBarry Smith while (its--) { 256b5b17502SBarry Smith /* 257b5b17502SBarry Smith forward sweep: 258b5b17502SBarry Smith for i=0,...,m-1: 259b5b17502SBarry Smith sum[i] = (b[i] - U(i,:)x)/d[i]; 260b5b17502SBarry Smith x[i] = (1-omega)x[i] + omega*sum[i]; 261b5b17502SBarry Smith b = b - x[i]*U^T(i,:); 262b5b17502SBarry Smith 263b5b17502SBarry Smith */ 264b5b17502SBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2659566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(t, b, m)); 266b5b17502SBarry Smith 267b5b17502SBarry Smith for (i = 0; i < m; i++) { 2689371c9d4SSatish Balay v = aa + ai[i] + 1; 2699371c9d4SSatish Balay v1 = v; 2709371c9d4SSatish Balay vj = aj + ai[i] + 1; 2719371c9d4SSatish Balay vj1 = vj; 2729371c9d4SSatish Balay nz = ai[i + 1] - ai[i] - 1; 2739371c9d4SSatish Balay nz1 = nz; 274b5b17502SBarry Smith sum = t[i]; 275b5b17502SBarry Smith while (nz1--) sum -= (*v1++) * x[*vj1++]; 276b5b17502SBarry Smith x[i] = (1 - omega) * x[i] + omega * sum * aidiag[i]; 277b5b17502SBarry Smith while (nz--) t[*vj++] -= x[i] * (*v++); 278b5b17502SBarry Smith } 2799566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * a->nz)); 280b5b17502SBarry Smith } 281b5b17502SBarry Smith 282b5b17502SBarry Smith if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 283b5b17502SBarry Smith /* 284b5b17502SBarry Smith backward sweep: 285b5b17502SBarry Smith b = b - x[i]*U^T(i,:), i=0,...,n-2 286b5b17502SBarry Smith for i=m-1,...,0: 287b5b17502SBarry Smith sum[i] = (b[i] - U(i,:)x)/d[i]; 288b5b17502SBarry Smith x[i] = (1-omega)x[i] + omega*sum[i]; 289b5b17502SBarry Smith */ 290b5b17502SBarry Smith /* if there was a forward sweep done above then I thing the next two for loops are not needed */ 2919566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(t, b, m)); 292b5b17502SBarry Smith 293b5b17502SBarry Smith for (i = 0; i < m - 1; i++) { /* update rhs */ 294b5b17502SBarry Smith v = aa + ai[i] + 1; 295b5b17502SBarry Smith vj = aj + ai[i] + 1; 296b5b17502SBarry Smith nz = ai[i + 1] - ai[i] - 1; 297b5b17502SBarry Smith while (nz--) t[*vj++] -= x[i] * (*v++); 298b5b17502SBarry Smith } 2999566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz - m))); 300b5b17502SBarry Smith for (i = m - 1; i >= 0; i--) { 301b5b17502SBarry Smith v = aa + ai[i] + 1; 302b5b17502SBarry Smith vj = aj + ai[i] + 1; 303b5b17502SBarry Smith nz = ai[i + 1] - ai[i] - 1; 304b5b17502SBarry Smith sum = t[i]; 305b5b17502SBarry Smith while (nz--) sum -= x[*vj++] * (*v++); 306b5b17502SBarry Smith x[i] = (1 - omega) * x[i] + omega * sum * aidiag[i]; 307b5b17502SBarry Smith } 3089566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz + m))); 309b5b17502SBarry Smith } 310b5b17502SBarry Smith } 311b5b17502SBarry Smith 3129566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 3139566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 3143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 315b5b17502SBarry Smith } 316