12c733ed4SBarry Smith #include <../src/mat/impls/baij/seq/baij.h> 22c733ed4SBarry Smith #include <petsc/private/kernels/blockinvert.h> 32c733ed4SBarry Smith 42c733ed4SBarry Smith /* Block operations are done by accessing one column at at time */ 52c733ed4SBarry Smith /* Default MatSolve for block size 11 */ 62c733ed4SBarry Smith 7*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_11_NaturalOrdering(Mat A, Vec bb, Vec xx) { 82c733ed4SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 92c733ed4SBarry Smith const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi, bs = A->rmap->bs, bs2 = a->bs2; 102c733ed4SBarry Smith PetscInt i, k, nz, idx, idt, m; 112c733ed4SBarry Smith const MatScalar *aa = a->a, *v; 122c733ed4SBarry Smith PetscScalar s[11]; 132c733ed4SBarry Smith PetscScalar *x, xv; 142c733ed4SBarry Smith const PetscScalar *b; 152c733ed4SBarry Smith 162c733ed4SBarry Smith PetscFunctionBegin; 179566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 189566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 192c733ed4SBarry Smith 202c733ed4SBarry Smith /* forward solve the lower triangular */ 212c733ed4SBarry Smith for (i = 0; i < n; i++) { 222c733ed4SBarry Smith v = aa + bs2 * ai[i]; 232c733ed4SBarry Smith vi = aj + ai[i]; 242c733ed4SBarry Smith nz = ai[i + 1] - ai[i]; 252c733ed4SBarry Smith idt = bs * i; 26*9371c9d4SSatish Balay x[idt] = b[idt]; 27*9371c9d4SSatish Balay x[1 + idt] = b[1 + idt]; 28*9371c9d4SSatish Balay x[2 + idt] = b[2 + idt]; 29*9371c9d4SSatish Balay x[3 + idt] = b[3 + idt]; 30*9371c9d4SSatish Balay x[4 + idt] = b[4 + idt]; 31*9371c9d4SSatish Balay x[5 + idt] = b[5 + idt]; 32*9371c9d4SSatish Balay x[6 + idt] = b[6 + idt]; 33*9371c9d4SSatish Balay x[7 + idt] = b[7 + idt]; 34*9371c9d4SSatish Balay x[8 + idt] = b[8 + idt]; 35*9371c9d4SSatish Balay x[9 + idt] = b[9 + idt]; 362c733ed4SBarry Smith x[10 + idt] = b[10 + idt]; 372c733ed4SBarry Smith for (m = 0; m < nz; m++) { 382c733ed4SBarry Smith idx = bs * vi[m]; 392c733ed4SBarry Smith for (k = 0; k < 11; k++) { 402c733ed4SBarry Smith xv = x[k + idx]; 412c733ed4SBarry Smith x[idt] -= v[0] * xv; 422c733ed4SBarry Smith x[1 + idt] -= v[1] * xv; 432c733ed4SBarry Smith x[2 + idt] -= v[2] * xv; 442c733ed4SBarry Smith x[3 + idt] -= v[3] * xv; 452c733ed4SBarry Smith x[4 + idt] -= v[4] * xv; 462c733ed4SBarry Smith x[5 + idt] -= v[5] * xv; 472c733ed4SBarry Smith x[6 + idt] -= v[6] * xv; 482c733ed4SBarry Smith x[7 + idt] -= v[7] * xv; 492c733ed4SBarry Smith x[8 + idt] -= v[8] * xv; 502c733ed4SBarry Smith x[9 + idt] -= v[9] * xv; 512c733ed4SBarry Smith x[10 + idt] -= v[10] * xv; 522c733ed4SBarry Smith v += 11; 532c733ed4SBarry Smith } 542c733ed4SBarry Smith } 552c733ed4SBarry Smith } 562c733ed4SBarry Smith /* backward solve the upper triangular */ 572c733ed4SBarry Smith for (i = n - 1; i >= 0; i--) { 582c733ed4SBarry Smith v = aa + bs2 * (adiag[i + 1] + 1); 592c733ed4SBarry Smith vi = aj + adiag[i + 1] + 1; 602c733ed4SBarry Smith nz = adiag[i] - adiag[i + 1] - 1; 612c733ed4SBarry Smith idt = bs * i; 62*9371c9d4SSatish Balay s[0] = x[idt]; 63*9371c9d4SSatish Balay s[1] = x[1 + idt]; 64*9371c9d4SSatish Balay s[2] = x[2 + idt]; 65*9371c9d4SSatish Balay s[3] = x[3 + idt]; 66*9371c9d4SSatish Balay s[4] = x[4 + idt]; 67*9371c9d4SSatish Balay s[5] = x[5 + idt]; 68*9371c9d4SSatish Balay s[6] = x[6 + idt]; 69*9371c9d4SSatish Balay s[7] = x[7 + idt]; 70*9371c9d4SSatish Balay s[8] = x[8 + idt]; 71*9371c9d4SSatish Balay s[9] = x[9 + idt]; 722c733ed4SBarry Smith s[10] = x[10 + idt]; 732c733ed4SBarry Smith 742c733ed4SBarry Smith for (m = 0; m < nz; m++) { 752c733ed4SBarry Smith idx = bs * vi[m]; 762c733ed4SBarry Smith for (k = 0; k < 11; k++) { 772c733ed4SBarry Smith xv = x[k + idx]; 782c733ed4SBarry Smith s[0] -= v[0] * xv; 792c733ed4SBarry Smith s[1] -= v[1] * xv; 802c733ed4SBarry Smith s[2] -= v[2] * xv; 812c733ed4SBarry Smith s[3] -= v[3] * xv; 822c733ed4SBarry Smith s[4] -= v[4] * xv; 832c733ed4SBarry Smith s[5] -= v[5] * xv; 842c733ed4SBarry Smith s[6] -= v[6] * xv; 852c733ed4SBarry Smith s[7] -= v[7] * xv; 862c733ed4SBarry Smith s[8] -= v[8] * xv; 872c733ed4SBarry Smith s[9] -= v[9] * xv; 882c733ed4SBarry Smith s[10] -= v[10] * xv; 892c733ed4SBarry Smith v += 11; 902c733ed4SBarry Smith } 912c733ed4SBarry Smith } 929566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(x + idt, bs)); 932c733ed4SBarry Smith for (k = 0; k < 11; k++) { 942c733ed4SBarry Smith x[idt] += v[0] * s[k]; 952c733ed4SBarry Smith x[1 + idt] += v[1] * s[k]; 962c733ed4SBarry Smith x[2 + idt] += v[2] * s[k]; 972c733ed4SBarry Smith x[3 + idt] += v[3] * s[k]; 982c733ed4SBarry Smith x[4 + idt] += v[4] * s[k]; 992c733ed4SBarry Smith x[5 + idt] += v[5] * s[k]; 1002c733ed4SBarry Smith x[6 + idt] += v[6] * s[k]; 1012c733ed4SBarry Smith x[7 + idt] += v[7] * s[k]; 1022c733ed4SBarry Smith x[8 + idt] += v[8] * s[k]; 1032c733ed4SBarry Smith x[9 + idt] += v[9] * s[k]; 1042c733ed4SBarry Smith x[10 + idt] += v[10] * s[k]; 1052c733ed4SBarry Smith v += 11; 1062c733ed4SBarry Smith } 1072c733ed4SBarry Smith } 1089566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 1099566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 1109566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n)); 1112c733ed4SBarry Smith PetscFunctionReturn(0); 1122c733ed4SBarry Smith } 113