xref: /petsc/src/mat/impls/sbaij/seq/relax.h (revision a8f51744601f91dc30d5f98153b1ecd91eebb0e5)
1b5b17502SBarry Smith 
2b5b17502SBarry Smith /*
3b5b17502SBarry Smith     This is included by sbaij.c to generate unsigned short and regular versions of these two functions
4b5b17502SBarry Smith */
554e8760dSRichard Tran Mills 
654e8760dSRichard Tran Mills /* We cut-and-past below from aij.h to make a "no_function" version of PetscSparseDensePlusDot().
754e8760dSRichard Tran Mills  * This is necessary because the USESHORT case cannot use the inlined functions that may be employed. */
854e8760dSRichard Tran Mills 
954e8760dSRichard Tran Mills #if defined(PETSC_KERNEL_USE_UNROLL_4)
109371c9d4SSatish Balay   #define PetscSparseDensePlusDot_no_function(sum, r, xv, xi, nnz) \
11*a8f51744SPierre Jolivet     do { \
1254e8760dSRichard Tran Mills       if (nnz > 0) { \
13a9ce9505SJunchao Zhang         PetscInt nnz2 = nnz, rem = nnz & 0x3; \
14a9ce9505SJunchao Zhang         switch (rem) { \
15d71ae5a4SJacob Faibussowitsch         case 3: \
16d71ae5a4SJacob Faibussowitsch           sum += *xv++ * r[*xi++]; \
17d71ae5a4SJacob Faibussowitsch         case 2: \
18d71ae5a4SJacob Faibussowitsch           sum += *xv++ * r[*xi++]; \
19d71ae5a4SJacob Faibussowitsch         case 1: \
20d71ae5a4SJacob Faibussowitsch           sum += *xv++ * r[*xi++]; \
21d71ae5a4SJacob Faibussowitsch           nnz2 -= rem; \
22a9ce9505SJunchao Zhang         } \
239371c9d4SSatish Balay         while (nnz2 > 0) { \
249371c9d4SSatish Balay           sum += xv[0] * r[xi[0]] + xv[1] * r[xi[1]] + xv[2] * r[xi[2]] + xv[3] * r[xi[3]]; \
259371c9d4SSatish Balay           xv += 4; \
269371c9d4SSatish Balay           xi += 4; \
279371c9d4SSatish Balay           nnz2 -= 4; \
289371c9d4SSatish Balay         } \
299371c9d4SSatish Balay         xv -= nnz; \
309371c9d4SSatish Balay         xi -= nnz; \
31a9ce9505SJunchao Zhang       } \
32*a8f51744SPierre Jolivet     } while (0)
3354e8760dSRichard Tran Mills 
3454e8760dSRichard Tran Mills #elif defined(PETSC_KERNEL_USE_UNROLL_2)
359371c9d4SSatish Balay   #define PetscSparseDensePlusDot_no_function(sum, r, xv, xi, nnz) \
36*a8f51744SPierre Jolivet     do { \
3754e8760dSRichard Tran Mills       PetscInt __i, __i1, __i2; \
389371c9d4SSatish Balay       for (__i = 0; __i < nnz - 1; __i += 2) { \
399371c9d4SSatish Balay         __i1 = xi[__i]; \
409371c9d4SSatish Balay         __i2 = xi[__i + 1]; \
419371c9d4SSatish Balay         sum += (xv[__i] * r[__i1] + xv[__i + 1] * r[__i2]); \
429371c9d4SSatish Balay       } \
439371c9d4SSatish Balay       if (nnz & 0x1) sum += xv[__i] * r[xi[__i]]; \
44*a8f51744SPierre Jolivet     } while (0)
4554e8760dSRichard Tran Mills 
4654e8760dSRichard Tran Mills #else
479371c9d4SSatish Balay   #define PetscSparseDensePlusDot_no_function(sum, r, xv, xi, nnz) \
48*a8f51744SPierre Jolivet     do { \
4954e8760dSRichard Tran Mills       PetscInt __i; \
509371c9d4SSatish Balay       for (__i = 0; __i < nnz; __i++) sum += xv[__i] * r[xi[__i]]; \
51*a8f51744SPierre Jolivet     } while (0)
5254e8760dSRichard Tran Mills #endif
5354e8760dSRichard Tran Mills 
54018dd85eSSatish Balay #if defined(USESHORT)
55018dd85eSSatish Balay PetscErrorCode MatMult_SeqSBAIJ_1_ushort(Mat A, Vec xx, Vec zz)
56018dd85eSSatish Balay #else
57018dd85eSSatish Balay PetscErrorCode MatMult_SeqSBAIJ_1(Mat A, Vec xx, Vec zz)
58018dd85eSSatish Balay #endif
59b5b17502SBarry Smith {
60b5b17502SBarry Smith   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
61b5b17502SBarry Smith   const PetscScalar *x;
62b5b17502SBarry Smith   PetscScalar       *z, x1, sum;
63b5b17502SBarry Smith   const MatScalar   *v;
64b5b17502SBarry Smith   MatScalar          vj;
65b5b17502SBarry Smith   PetscInt           mbs = a->mbs, i, j, nz;
66b5b17502SBarry Smith   const PetscInt    *ai  = a->i;
67b5b17502SBarry Smith #if defined(USESHORT)
68b5b17502SBarry Smith   const unsigned short *ib = a->jshort;
69b5b17502SBarry Smith   unsigned short        ibt;
70b5b17502SBarry Smith #else
71b5b17502SBarry Smith   const PetscInt *ib = a->j;
72b5b17502SBarry Smith   PetscInt        ibt;
73b5b17502SBarry Smith #endif
748dca527aSShri Abhyankar   PetscInt nonzerorow = 0, jmin;
75eb1ec7c1SStefano Zampini #if defined(PETSC_USE_COMPLEX)
76b94d7dedSBarry Smith   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
77eb1ec7c1SStefano Zampini #else
78eb1ec7c1SStefano Zampini   const int       aconj = 0;
79eb1ec7c1SStefano Zampini #endif
80b5b17502SBarry Smith 
81b5b17502SBarry Smith   PetscFunctionBegin;
829566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, 0.0));
839566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
849566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
85b5b17502SBarry Smith 
86b5b17502SBarry Smith   v = a->a;
87b5b17502SBarry Smith   for (i = 0; i < mbs; i++) {
88b5b17502SBarry Smith     nz = ai[i + 1] - ai[i]; /* length of i_th row of A */
8903d09022SShri Abhyankar     if (!nz) continue;      /* Move to the next row if the current row is empty */
9003d09022SShri Abhyankar     nonzerorow++;
918dca527aSShri Abhyankar     sum  = 0.0;
928dca527aSShri Abhyankar     jmin = 0;
93b5b17502SBarry Smith     x1   = x[i];
948dca527aSShri Abhyankar     if (ib[0] == i) {
95b5b17502SBarry Smith       sum = v[0] * x1; /* diagonal term */
968dca527aSShri Abhyankar       jmin++;
978dca527aSShri Abhyankar     }
98444d8c10SJed Brown     PetscPrefetchBlock(ib + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
99444d8c10SJed Brown     PetscPrefetchBlock(v + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);  /* Entries for the next row */
100eb1ec7c1SStefano Zampini     if (aconj) {
101eb1ec7c1SStefano Zampini       for (j = jmin; j < nz; j++) {
102eb1ec7c1SStefano Zampini         ibt = ib[j];
103eb1ec7c1SStefano Zampini         vj  = v[j];
104eb1ec7c1SStefano Zampini         z[ibt] += PetscConj(vj) * x1; /* (strict lower triangular part of A)*x  */
105eb1ec7c1SStefano Zampini         sum += vj * x[ibt];           /* (strict upper triangular part of A)*x  */
106eb1ec7c1SStefano Zampini       }
107eb1ec7c1SStefano Zampini     } else {
1088dca527aSShri Abhyankar       for (j = jmin; j < nz; j++) {
109b5b17502SBarry Smith         ibt = ib[j];
110b5b17502SBarry Smith         vj  = v[j];
111b5b17502SBarry Smith         z[ibt] += vj * x1;  /* (strict lower triangular part of A)*x  */
112b5b17502SBarry Smith         sum += vj * x[ibt]; /* (strict upper triangular part of A)*x  */
113b5b17502SBarry Smith       }
114eb1ec7c1SStefano Zampini     }
115b5b17502SBarry Smith     z[i] += sum;
116b5b17502SBarry Smith     v += nz;
117b5b17502SBarry Smith     ib += nz;
118b5b17502SBarry Smith   }
119b5b17502SBarry Smith 
1209566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
1219566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
1229566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (2.0 * a->nz - nonzerorow) - nonzerorow));
1233ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
124b5b17502SBarry Smith }
125b5b17502SBarry Smith 
126018dd85eSSatish Balay #if defined(USESHORT)
127018dd85eSSatish Balay PetscErrorCode MatSOR_SeqSBAIJ_ushort(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
128018dd85eSSatish Balay #else
129018dd85eSSatish Balay PetscErrorCode MatSOR_SeqSBAIJ(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
130018dd85eSSatish Balay #endif
131b5b17502SBarry Smith {
132b5b17502SBarry Smith   Mat_SeqSBAIJ      *a  = (Mat_SeqSBAIJ *)A->data;
133b5b17502SBarry Smith   const MatScalar   *aa = a->a, *v, *v1, *aidiag;
134b5b17502SBarry Smith   PetscScalar       *x, *t, sum;
135b5b17502SBarry Smith   const PetscScalar *b;
136b5b17502SBarry Smith   MatScalar          tmp;
137b5b17502SBarry Smith   PetscInt           m = a->mbs, bs = A->rmap->bs, j;
138b5b17502SBarry Smith   const PetscInt    *ai = a->i;
139b5b17502SBarry Smith #if defined(USESHORT)
140b5b17502SBarry Smith   const unsigned short *aj = a->jshort, *vj, *vj1;
141b5b17502SBarry Smith #else
142b5b17502SBarry Smith   const PetscInt *aj = a->j, *vj, *vj1;
143b5b17502SBarry Smith #endif
144b5b17502SBarry Smith   PetscInt nz, nz1, i;
145b5b17502SBarry Smith 
146b5b17502SBarry Smith   PetscFunctionBegin;
147422a814eSBarry Smith   if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
148aed4548fSBarry Smith   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
149b5b17502SBarry Smith 
150b5b17502SBarry Smith   its = its * lits;
15108401ef6SPierre Jolivet   PetscCheck(its > 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Relaxation requires global its %" PetscInt_FMT " and local its %" PetscInt_FMT " both positive", its, lits);
152b5b17502SBarry Smith 
15308401ef6SPierre Jolivet   PetscCheck(bs <= 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "SSOR for block size > 1 is not yet implemented");
154b5b17502SBarry Smith 
1559566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
1569566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
157b5b17502SBarry Smith 
158b5b17502SBarry Smith   if (!a->idiagvalid) {
15948a46eb9SPierre Jolivet     if (!a->idiag) PetscCall(PetscMalloc1(m, &a->idiag));
160b5b17502SBarry Smith     for (i = 0; i < a->mbs; i++) a->idiag[i] = 1.0 / a->a[a->i[i]];
161b5b17502SBarry Smith     a->idiagvalid = PETSC_TRUE;
162b5b17502SBarry Smith   }
163b5b17502SBarry Smith 
16448a46eb9SPierre Jolivet   if (!a->sor_work) PetscCall(PetscMalloc1(m, &a->sor_work));
16541f059aeSBarry Smith   t = a->sor_work;
166b5b17502SBarry Smith 
167b5b17502SBarry Smith   aidiag = a->idiag;
168b5b17502SBarry Smith 
169b5b17502SBarry Smith   if (flag == SOR_APPLY_UPPER) {
170b5b17502SBarry Smith     /* apply (U + D/omega) to the vector */
171b5b17502SBarry Smith     PetscScalar d;
172b5b17502SBarry Smith     for (i = 0; i < m; i++) {
173b5b17502SBarry Smith       d   = fshift + aa[ai[i]];
174b5b17502SBarry Smith       nz  = ai[i + 1] - ai[i] - 1;
175b5b17502SBarry Smith       vj  = aj + ai[i] + 1;
176b5b17502SBarry Smith       v   = aa + ai[i] + 1;
177b5b17502SBarry Smith       sum = b[i] * d / omega;
17854e8760dSRichard Tran Mills #ifdef USESHORT
17954e8760dSRichard Tran Mills       PetscSparseDensePlusDot_no_function(sum, b, v, vj, nz);
18054e8760dSRichard Tran Mills #else
181b5b17502SBarry Smith       PetscSparseDensePlusDot(sum, b, v, vj, nz);
18254e8760dSRichard Tran Mills #endif
183b5b17502SBarry Smith       x[i] = sum;
184b5b17502SBarry Smith     }
1859566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
186b5b17502SBarry Smith   }
187b5b17502SBarry Smith 
188b5b17502SBarry Smith   if (flag & SOR_ZERO_INITIAL_GUESS) {
189b5b17502SBarry Smith     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
1909566063dSJacob Faibussowitsch       PetscCall(PetscArraycpy(t, b, m));
191b5b17502SBarry Smith 
192b5b17502SBarry Smith       v  = aa + 1;
193b5b17502SBarry Smith       vj = aj + 1;
194b5b17502SBarry Smith       for (i = 0; i < m; i++) {
195b5b17502SBarry Smith         nz  = ai[i + 1] - ai[i] - 1;
196b5b17502SBarry Smith         tmp = -(x[i] = omega * t[i] * aidiag[i]);
19726fbe8dcSKarl Rupp         for (j = 0; j < nz; j++) t[vj[j]] += tmp * v[j];
198b5b17502SBarry Smith         v += nz + 1;
199b5b17502SBarry Smith         vj += nz + 1;
200b5b17502SBarry Smith       }
2019566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(2.0 * a->nz));
202b5b17502SBarry Smith     }
203b5b17502SBarry Smith 
204b5b17502SBarry Smith     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
205b5b17502SBarry Smith       int nz2;
206b5b17502SBarry Smith       if (!(flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP)) {
207b5b17502SBarry Smith #if defined(PETSC_USE_BACKWARD_LOOP)
208b5b17502SBarry Smith         v  = aa + ai[m] - 1;
209b5b17502SBarry Smith         vj = aj + ai[m] - 1;
210b5b17502SBarry Smith         for (i = m - 1; i >= 0; i--) {
211b5b17502SBarry Smith           sum = b[i];
212b5b17502SBarry Smith           nz  = ai[i + 1] - ai[i] - 1;
2139371c9d4SSatish Balay           {
2149371c9d4SSatish Balay             PetscInt __i;
2159371c9d4SSatish Balay             for (__i = 0; __i < nz; __i++) sum -= v[-__i] * x[vj[-__i]];
2169371c9d4SSatish Balay           }
217b5b17502SBarry Smith #else
218b5b17502SBarry Smith         v  = aa + ai[m - 1] + 1;
219b5b17502SBarry Smith         vj = aj + ai[m - 1] + 1;
220b5b17502SBarry Smith         nz = 0;
221b5b17502SBarry Smith         for (i = m - 1; i >= 0; i--) {
222b5b17502SBarry Smith           sum = b[i];
2236c11f900SJed Brown           nz2 = ai[i] - ai[PetscMax(i - 1, 0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */
22450d8bf02SJed Brown           PETSC_Prefetch(v - nz2 - 1, 0, PETSC_PREFETCH_HINT_NTA);
22550d8bf02SJed Brown           PETSC_Prefetch(vj - nz2 - 1, 0, PETSC_PREFETCH_HINT_NTA);
226b5b17502SBarry Smith           PetscSparseDenseMinusDot(sum, x, v, vj, nz);
227b5b17502SBarry Smith           nz = nz2;
228b5b17502SBarry Smith #endif
229b5b17502SBarry Smith           x[i] = omega * sum * aidiag[i];
230b5b17502SBarry Smith           v -= nz + 1;
231b5b17502SBarry Smith           vj -= nz + 1;
232b5b17502SBarry Smith         }
2339566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(2.0 * a->nz));
234b5b17502SBarry Smith       } else {
235b5b17502SBarry Smith         v  = aa + ai[m - 1] + 1;
236b5b17502SBarry Smith         vj = aj + ai[m - 1] + 1;
237b5b17502SBarry Smith         nz = 0;
238b5b17502SBarry Smith         for (i = m - 1; i >= 0; i--) {
239b5b17502SBarry Smith           sum = t[i];
2406c11f900SJed Brown           nz2 = ai[i] - ai[PetscMax(i - 1, 0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */
24150d8bf02SJed Brown           PETSC_Prefetch(v - nz2 - 1, 0, PETSC_PREFETCH_HINT_NTA);
24250d8bf02SJed Brown           PETSC_Prefetch(vj - nz2 - 1, 0, PETSC_PREFETCH_HINT_NTA);
243b5b17502SBarry Smith           PetscSparseDenseMinusDot(sum, x, v, vj, nz);
244b5b17502SBarry Smith           x[i] = (1 - omega) * x[i] + omega * sum * aidiag[i];
245b5b17502SBarry Smith           nz   = nz2;
246b5b17502SBarry Smith           v -= nz + 1;
247b5b17502SBarry Smith           vj -= nz + 1;
248b5b17502SBarry Smith         }
2499566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(2.0 * a->nz));
250b5b17502SBarry Smith       }
251b5b17502SBarry Smith     }
252b5b17502SBarry Smith     its--;
253b5b17502SBarry Smith   }
254b5b17502SBarry Smith 
255b5b17502SBarry Smith   while (its--) {
256b5b17502SBarry Smith     /*
257b5b17502SBarry Smith        forward sweep:
258b5b17502SBarry Smith        for i=0,...,m-1:
259b5b17502SBarry Smith          sum[i] = (b[i] - U(i,:)x)/d[i];
260b5b17502SBarry Smith          x[i]   = (1-omega)x[i] + omega*sum[i];
261b5b17502SBarry Smith          b      = b - x[i]*U^T(i,:);
262b5b17502SBarry Smith 
263b5b17502SBarry Smith     */
264b5b17502SBarry Smith     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2659566063dSJacob Faibussowitsch       PetscCall(PetscArraycpy(t, b, m));
266b5b17502SBarry Smith 
267b5b17502SBarry Smith       for (i = 0; i < m; i++) {
2689371c9d4SSatish Balay         v   = aa + ai[i] + 1;
2699371c9d4SSatish Balay         v1  = v;
2709371c9d4SSatish Balay         vj  = aj + ai[i] + 1;
2719371c9d4SSatish Balay         vj1 = vj;
2729371c9d4SSatish Balay         nz  = ai[i + 1] - ai[i] - 1;
2739371c9d4SSatish Balay         nz1 = nz;
274b5b17502SBarry Smith         sum = t[i];
275b5b17502SBarry Smith         while (nz1--) sum -= (*v1++) * x[*vj1++];
276b5b17502SBarry Smith         x[i] = (1 - omega) * x[i] + omega * sum * aidiag[i];
277b5b17502SBarry Smith         while (nz--) t[*vj++] -= x[i] * (*v++);
278b5b17502SBarry Smith       }
2799566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(4.0 * a->nz));
280b5b17502SBarry Smith     }
281b5b17502SBarry Smith 
282b5b17502SBarry Smith     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
283b5b17502SBarry Smith       /*
284b5b17502SBarry Smith        backward sweep:
285b5b17502SBarry Smith        b = b - x[i]*U^T(i,:), i=0,...,n-2
286b5b17502SBarry Smith        for i=m-1,...,0:
287b5b17502SBarry Smith          sum[i] = (b[i] - U(i,:)x)/d[i];
288b5b17502SBarry Smith          x[i]   = (1-omega)x[i] + omega*sum[i];
289b5b17502SBarry Smith       */
290b5b17502SBarry Smith       /* if there was a forward sweep done above then I thing the next two for loops are not needed */
2919566063dSJacob Faibussowitsch       PetscCall(PetscArraycpy(t, b, m));
292b5b17502SBarry Smith 
293b5b17502SBarry Smith       for (i = 0; i < m - 1; i++) { /* update rhs */
294b5b17502SBarry Smith         v  = aa + ai[i] + 1;
295b5b17502SBarry Smith         vj = aj + ai[i] + 1;
296b5b17502SBarry Smith         nz = ai[i + 1] - ai[i] - 1;
297b5b17502SBarry Smith         while (nz--) t[*vj++] -= x[i] * (*v++);
298b5b17502SBarry Smith       }
2999566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(2.0 * (a->nz - m)));
300b5b17502SBarry Smith       for (i = m - 1; i >= 0; i--) {
301b5b17502SBarry Smith         v   = aa + ai[i] + 1;
302b5b17502SBarry Smith         vj  = aj + ai[i] + 1;
303b5b17502SBarry Smith         nz  = ai[i + 1] - ai[i] - 1;
304b5b17502SBarry Smith         sum = t[i];
305b5b17502SBarry Smith         while (nz--) sum -= x[*vj++] * (*v++);
306b5b17502SBarry Smith         x[i] = (1 - omega) * x[i] + omega * sum * aidiag[i];
307b5b17502SBarry Smith       }
3089566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(2.0 * (a->nz + m)));
309b5b17502SBarry Smith     }
310b5b17502SBarry Smith   }
311b5b17502SBarry Smith 
3129566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
3139566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
3143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
315b5b17502SBarry Smith }
316