12c733ed4SBarry Smith #include <../src/mat/impls/baij/seq/baij.h> 22c733ed4SBarry Smith #include <petsc/private/kernels/blockinvert.h> 32c733ed4SBarry Smith 42c733ed4SBarry Smith /* Block operations are done by accessing one column at at time */ 52c733ed4SBarry Smith /* Default MatSolve for block size 14 */ 62c733ed4SBarry Smith 72c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_14_NaturalOrdering(Mat A,Vec bb,Vec xx) 82c733ed4SBarry Smith { 92c733ed4SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 102c733ed4SBarry Smith const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 112c733ed4SBarry Smith PetscInt i,k,nz,idx,idt,m; 122c733ed4SBarry Smith const MatScalar *aa=a->a,*v; 132c733ed4SBarry Smith PetscScalar s[14]; 142c733ed4SBarry Smith PetscScalar *x,xv; 152c733ed4SBarry Smith const PetscScalar *b; 162c733ed4SBarry Smith 172c733ed4SBarry Smith PetscFunctionBegin; 18*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 19*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 202c733ed4SBarry Smith 212c733ed4SBarry Smith /* forward solve the lower triangular */ 222c733ed4SBarry Smith for (i=0; i<n; i++) { 232c733ed4SBarry Smith v = aa + bs2*ai[i]; 242c733ed4SBarry Smith vi = aj + ai[i]; 252c733ed4SBarry Smith nz = ai[i+1] - ai[i]; 262c733ed4SBarry Smith idt = bs*i; 272c733ed4SBarry Smith x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 282c733ed4SBarry Smith x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 292c733ed4SBarry Smith x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; 302c733ed4SBarry Smith for (m=0; m<nz; m++) { 312c733ed4SBarry Smith idx = bs*vi[m]; 322c733ed4SBarry Smith for (k=0; k<bs; k++) { 332c733ed4SBarry Smith xv = x[k + idx]; 342c733ed4SBarry Smith x[idt] -= v[0]*xv; 352c733ed4SBarry Smith x[1+idt] -= v[1]*xv; 362c733ed4SBarry Smith x[2+idt] -= v[2]*xv; 372c733ed4SBarry Smith x[3+idt] -= v[3]*xv; 382c733ed4SBarry Smith x[4+idt] -= v[4]*xv; 392c733ed4SBarry Smith x[5+idt] -= v[5]*xv; 402c733ed4SBarry Smith x[6+idt] -= v[6]*xv; 412c733ed4SBarry Smith x[7+idt] -= v[7]*xv; 422c733ed4SBarry Smith x[8+idt] -= v[8]*xv; 432c733ed4SBarry Smith x[9+idt] -= v[9]*xv; 442c733ed4SBarry Smith x[10+idt] -= v[10]*xv; 452c733ed4SBarry Smith x[11+idt] -= v[11]*xv; 462c733ed4SBarry Smith x[12+idt] -= v[12]*xv; 472c733ed4SBarry Smith x[13+idt] -= v[13]*xv; 482c733ed4SBarry Smith v += 14; 492c733ed4SBarry Smith } 502c733ed4SBarry Smith } 512c733ed4SBarry Smith } 522c733ed4SBarry Smith /* backward solve the upper triangular */ 532c733ed4SBarry Smith for (i=n-1; i>=0; i--) { 542c733ed4SBarry Smith v = aa + bs2*(adiag[i+1]+1); 552c733ed4SBarry Smith vi = aj + adiag[i+1]+1; 562c733ed4SBarry Smith nz = adiag[i] - adiag[i+1] - 1; 572c733ed4SBarry Smith idt = bs*i; 582c733ed4SBarry Smith s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 592c733ed4SBarry Smith s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 602c733ed4SBarry Smith s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; 612c733ed4SBarry Smith 622c733ed4SBarry Smith for (m=0; m<nz; m++) { 632c733ed4SBarry Smith idx = bs*vi[m]; 642c733ed4SBarry Smith for (k=0; k<bs; k++) { 652c733ed4SBarry Smith xv = x[k + idx]; 662c733ed4SBarry Smith s[0] -= v[0]*xv; 672c733ed4SBarry Smith s[1] -= v[1]*xv; 682c733ed4SBarry Smith s[2] -= v[2]*xv; 692c733ed4SBarry Smith s[3] -= v[3]*xv; 702c733ed4SBarry Smith s[4] -= v[4]*xv; 712c733ed4SBarry Smith s[5] -= v[5]*xv; 722c733ed4SBarry Smith s[6] -= v[6]*xv; 732c733ed4SBarry Smith s[7] -= v[7]*xv; 742c733ed4SBarry Smith s[8] -= v[8]*xv; 752c733ed4SBarry Smith s[9] -= v[9]*xv; 762c733ed4SBarry Smith s[10] -= v[10]*xv; 772c733ed4SBarry Smith s[11] -= v[11]*xv; 782c733ed4SBarry Smith s[12] -= v[12]*xv; 792c733ed4SBarry Smith s[13] -= v[13]*xv; 802c733ed4SBarry Smith v += 14; 812c733ed4SBarry Smith } 822c733ed4SBarry Smith } 83*9566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(x+idt,bs)); 842c733ed4SBarry Smith for (k=0; k<bs; k++) { 852c733ed4SBarry Smith x[idt] += v[0]*s[k]; 862c733ed4SBarry Smith x[1+idt] += v[1]*s[k]; 872c733ed4SBarry Smith x[2+idt] += v[2]*s[k]; 882c733ed4SBarry Smith x[3+idt] += v[3]*s[k]; 892c733ed4SBarry Smith x[4+idt] += v[4]*s[k]; 902c733ed4SBarry Smith x[5+idt] += v[5]*s[k]; 912c733ed4SBarry Smith x[6+idt] += v[6]*s[k]; 922c733ed4SBarry Smith x[7+idt] += v[7]*s[k]; 932c733ed4SBarry Smith x[8+idt] += v[8]*s[k]; 942c733ed4SBarry Smith x[9+idt] += v[9]*s[k]; 952c733ed4SBarry Smith x[10+idt] += v[10]*s[k]; 962c733ed4SBarry Smith x[11+idt] += v[11]*s[k]; 972c733ed4SBarry Smith x[12+idt] += v[12]*s[k]; 982c733ed4SBarry Smith x[13+idt] += v[13]*s[k]; 992c733ed4SBarry Smith v += 14; 1002c733ed4SBarry Smith } 1012c733ed4SBarry Smith } 102*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 103*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 104*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n)); 1052c733ed4SBarry Smith PetscFunctionReturn(0); 1062c733ed4SBarry Smith } 1072c733ed4SBarry Smith 1082c733ed4SBarry Smith /* Block operations are done by accessing one column at at time */ 1092c733ed4SBarry Smith /* Default MatSolve for block size 13 */ 1102c733ed4SBarry Smith 1112c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_13_NaturalOrdering(Mat A,Vec bb,Vec xx) 1122c733ed4SBarry Smith { 1132c733ed4SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 1142c733ed4SBarry Smith const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 1152c733ed4SBarry Smith PetscInt i,k,nz,idx,idt,m; 1162c733ed4SBarry Smith const MatScalar *aa=a->a,*v; 1172c733ed4SBarry Smith PetscScalar s[13]; 1182c733ed4SBarry Smith PetscScalar *x,xv; 1192c733ed4SBarry Smith const PetscScalar *b; 1202c733ed4SBarry Smith 1212c733ed4SBarry Smith PetscFunctionBegin; 122*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 123*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 1242c733ed4SBarry Smith 1252c733ed4SBarry Smith /* forward solve the lower triangular */ 1262c733ed4SBarry Smith for (i=0; i<n; i++) { 1272c733ed4SBarry Smith v = aa + bs2*ai[i]; 1282c733ed4SBarry Smith vi = aj + ai[i]; 1292c733ed4SBarry Smith nz = ai[i+1] - ai[i]; 1302c733ed4SBarry Smith idt = bs*i; 1312c733ed4SBarry Smith x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 1322c733ed4SBarry Smith x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 1332c733ed4SBarry Smith x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; 1342c733ed4SBarry Smith for (m=0; m<nz; m++) { 1352c733ed4SBarry Smith idx = bs*vi[m]; 1362c733ed4SBarry Smith for (k=0; k<bs; k++) { 1372c733ed4SBarry Smith xv = x[k + idx]; 1382c733ed4SBarry Smith x[idt] -= v[0]*xv; 1392c733ed4SBarry Smith x[1+idt] -= v[1]*xv; 1402c733ed4SBarry Smith x[2+idt] -= v[2]*xv; 1412c733ed4SBarry Smith x[3+idt] -= v[3]*xv; 1422c733ed4SBarry Smith x[4+idt] -= v[4]*xv; 1432c733ed4SBarry Smith x[5+idt] -= v[5]*xv; 1442c733ed4SBarry Smith x[6+idt] -= v[6]*xv; 1452c733ed4SBarry Smith x[7+idt] -= v[7]*xv; 1462c733ed4SBarry Smith x[8+idt] -= v[8]*xv; 1472c733ed4SBarry Smith x[9+idt] -= v[9]*xv; 1482c733ed4SBarry Smith x[10+idt] -= v[10]*xv; 1492c733ed4SBarry Smith x[11+idt] -= v[11]*xv; 1502c733ed4SBarry Smith x[12+idt] -= v[12]*xv; 1512c733ed4SBarry Smith v += 13; 1522c733ed4SBarry Smith } 1532c733ed4SBarry Smith } 1542c733ed4SBarry Smith } 1552c733ed4SBarry Smith /* backward solve the upper triangular */ 1562c733ed4SBarry Smith for (i=n-1; i>=0; i--) { 1572c733ed4SBarry Smith v = aa + bs2*(adiag[i+1]+1); 1582c733ed4SBarry Smith vi = aj + adiag[i+1]+1; 1592c733ed4SBarry Smith nz = adiag[i] - adiag[i+1] - 1; 1602c733ed4SBarry Smith idt = bs*i; 1612c733ed4SBarry Smith s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 1622c733ed4SBarry Smith s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 1632c733ed4SBarry Smith s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; 1642c733ed4SBarry Smith 1652c733ed4SBarry Smith for (m=0; m<nz; m++) { 1662c733ed4SBarry Smith idx = bs*vi[m]; 1672c733ed4SBarry Smith for (k=0; k<bs; k++) { 1682c733ed4SBarry Smith xv = x[k + idx]; 1692c733ed4SBarry Smith s[0] -= v[0]*xv; 1702c733ed4SBarry Smith s[1] -= v[1]*xv; 1712c733ed4SBarry Smith s[2] -= v[2]*xv; 1722c733ed4SBarry Smith s[3] -= v[3]*xv; 1732c733ed4SBarry Smith s[4] -= v[4]*xv; 1742c733ed4SBarry Smith s[5] -= v[5]*xv; 1752c733ed4SBarry Smith s[6] -= v[6]*xv; 1762c733ed4SBarry Smith s[7] -= v[7]*xv; 1772c733ed4SBarry Smith s[8] -= v[8]*xv; 1782c733ed4SBarry Smith s[9] -= v[9]*xv; 1792c733ed4SBarry Smith s[10] -= v[10]*xv; 1802c733ed4SBarry Smith s[11] -= v[11]*xv; 1812c733ed4SBarry Smith s[12] -= v[12]*xv; 1822c733ed4SBarry Smith v += 13; 1832c733ed4SBarry Smith } 1842c733ed4SBarry Smith } 185*9566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(x+idt,bs)); 1862c733ed4SBarry Smith for (k=0; k<bs; k++) { 1872c733ed4SBarry Smith x[idt] += v[0]*s[k]; 1882c733ed4SBarry Smith x[1+idt] += v[1]*s[k]; 1892c733ed4SBarry Smith x[2+idt] += v[2]*s[k]; 1902c733ed4SBarry Smith x[3+idt] += v[3]*s[k]; 1912c733ed4SBarry Smith x[4+idt] += v[4]*s[k]; 1922c733ed4SBarry Smith x[5+idt] += v[5]*s[k]; 1932c733ed4SBarry Smith x[6+idt] += v[6]*s[k]; 1942c733ed4SBarry Smith x[7+idt] += v[7]*s[k]; 1952c733ed4SBarry Smith x[8+idt] += v[8]*s[k]; 1962c733ed4SBarry Smith x[9+idt] += v[9]*s[k]; 1972c733ed4SBarry Smith x[10+idt] += v[10]*s[k]; 1982c733ed4SBarry Smith x[11+idt] += v[11]*s[k]; 1992c733ed4SBarry Smith x[12+idt] += v[12]*s[k]; 2002c733ed4SBarry Smith v += 13; 2012c733ed4SBarry Smith } 2022c733ed4SBarry Smith } 203*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 204*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 205*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n)); 2062c733ed4SBarry Smith PetscFunctionReturn(0); 2072c733ed4SBarry Smith } 2082c733ed4SBarry Smith 2092c733ed4SBarry Smith /* Block operations are done by accessing one column at at time */ 2102c733ed4SBarry Smith /* Default MatSolve for block size 12 */ 2112c733ed4SBarry Smith 2122c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_12_NaturalOrdering(Mat A,Vec bb,Vec xx) 2132c733ed4SBarry Smith { 2142c733ed4SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 2152c733ed4SBarry Smith const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2162c733ed4SBarry Smith PetscInt i,k,nz,idx,idt,m; 2172c733ed4SBarry Smith const MatScalar *aa=a->a,*v; 2182c733ed4SBarry Smith PetscScalar s[12]; 2192c733ed4SBarry Smith PetscScalar *x,xv; 2202c733ed4SBarry Smith const PetscScalar *b; 2212c733ed4SBarry Smith 2222c733ed4SBarry Smith PetscFunctionBegin; 223*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 224*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 2252c733ed4SBarry Smith 2262c733ed4SBarry Smith /* forward solve the lower triangular */ 2272c733ed4SBarry Smith for (i=0; i<n; i++) { 2282c733ed4SBarry Smith v = aa + bs2*ai[i]; 2292c733ed4SBarry Smith vi = aj + ai[i]; 2302c733ed4SBarry Smith nz = ai[i+1] - ai[i]; 2312c733ed4SBarry Smith idt = bs*i; 2322c733ed4SBarry Smith x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 2332c733ed4SBarry Smith x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 2342c733ed4SBarry Smith x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; 2352c733ed4SBarry Smith for (m=0; m<nz; m++) { 2362c733ed4SBarry Smith idx = bs*vi[m]; 2372c733ed4SBarry Smith for (k=0; k<bs; k++) { 2382c733ed4SBarry Smith xv = x[k + idx]; 2392c733ed4SBarry Smith x[idt] -= v[0]*xv; 2402c733ed4SBarry Smith x[1+idt] -= v[1]*xv; 2412c733ed4SBarry Smith x[2+idt] -= v[2]*xv; 2422c733ed4SBarry Smith x[3+idt] -= v[3]*xv; 2432c733ed4SBarry Smith x[4+idt] -= v[4]*xv; 2442c733ed4SBarry Smith x[5+idt] -= v[5]*xv; 2452c733ed4SBarry Smith x[6+idt] -= v[6]*xv; 2462c733ed4SBarry Smith x[7+idt] -= v[7]*xv; 2472c733ed4SBarry Smith x[8+idt] -= v[8]*xv; 2482c733ed4SBarry Smith x[9+idt] -= v[9]*xv; 2492c733ed4SBarry Smith x[10+idt] -= v[10]*xv; 2502c733ed4SBarry Smith x[11+idt] -= v[11]*xv; 2512c733ed4SBarry Smith v += 12; 2522c733ed4SBarry Smith } 2532c733ed4SBarry Smith } 2542c733ed4SBarry Smith } 2552c733ed4SBarry Smith /* backward solve the upper triangular */ 2562c733ed4SBarry Smith for (i=n-1; i>=0; i--) { 2572c733ed4SBarry Smith v = aa + bs2*(adiag[i+1]+1); 2582c733ed4SBarry Smith vi = aj + adiag[i+1]+1; 2592c733ed4SBarry Smith nz = adiag[i] - adiag[i+1] - 1; 2602c733ed4SBarry Smith idt = bs*i; 2612c733ed4SBarry Smith s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 2622c733ed4SBarry Smith s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 2632c733ed4SBarry Smith s[10] = x[10+idt]; s[11] = x[11+idt]; 2642c733ed4SBarry Smith 2652c733ed4SBarry Smith for (m=0; m<nz; m++) { 2662c733ed4SBarry Smith idx = bs*vi[m]; 2672c733ed4SBarry Smith for (k=0; k<bs; k++) { 2682c733ed4SBarry Smith xv = x[k + idx]; 2692c733ed4SBarry Smith s[0] -= v[0]*xv; 2702c733ed4SBarry Smith s[1] -= v[1]*xv; 2712c733ed4SBarry Smith s[2] -= v[2]*xv; 2722c733ed4SBarry Smith s[3] -= v[3]*xv; 2732c733ed4SBarry Smith s[4] -= v[4]*xv; 2742c733ed4SBarry Smith s[5] -= v[5]*xv; 2752c733ed4SBarry Smith s[6] -= v[6]*xv; 2762c733ed4SBarry Smith s[7] -= v[7]*xv; 2772c733ed4SBarry Smith s[8] -= v[8]*xv; 2782c733ed4SBarry Smith s[9] -= v[9]*xv; 2792c733ed4SBarry Smith s[10] -= v[10]*xv; 2802c733ed4SBarry Smith s[11] -= v[11]*xv; 2812c733ed4SBarry Smith v += 12; 2822c733ed4SBarry Smith } 2832c733ed4SBarry Smith } 284*9566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(x+idt,bs)); 2852c733ed4SBarry Smith for (k=0; k<bs; k++) { 2862c733ed4SBarry Smith x[idt] += v[0]*s[k]; 2872c733ed4SBarry Smith x[1+idt] += v[1]*s[k]; 2882c733ed4SBarry Smith x[2+idt] += v[2]*s[k]; 2892c733ed4SBarry Smith x[3+idt] += v[3]*s[k]; 2902c733ed4SBarry Smith x[4+idt] += v[4]*s[k]; 2912c733ed4SBarry Smith x[5+idt] += v[5]*s[k]; 2922c733ed4SBarry Smith x[6+idt] += v[6]*s[k]; 2932c733ed4SBarry Smith x[7+idt] += v[7]*s[k]; 2942c733ed4SBarry Smith x[8+idt] += v[8]*s[k]; 2952c733ed4SBarry Smith x[9+idt] += v[9]*s[k]; 2962c733ed4SBarry Smith x[10+idt] += v[10]*s[k]; 2972c733ed4SBarry Smith x[11+idt] += v[11]*s[k]; 2982c733ed4SBarry Smith v += 12; 2992c733ed4SBarry Smith } 3002c733ed4SBarry Smith } 301*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 302*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 303*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n)); 3042c733ed4SBarry Smith PetscFunctionReturn(0); 3052c733ed4SBarry Smith } 306