1 #include <../src/mat/impls/sbaij/seq/sbaij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 /* 5 Version for when blocks are 5 by 5 Using natural ordering 6 */ 7 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) 8 { 9 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data; 10 PetscInt i, j, mbs = a->mbs, *bi = b->i, *bj = b->j; 11 PetscInt *ai, *aj, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili, ipvt[5]; 12 MatScalar *ba = b->a, *aa, *ap, *dk, *uik; 13 MatScalar *u, *d, *rtmp, *rtmp_ptr, work[25]; 14 PetscReal shift = info->shiftamount; 15 PetscBool allowzeropivot, zeropivotdetected; 16 17 PetscFunctionBegin; 18 /* initialization */ 19 allowzeropivot = PetscNot(A->erroriffailure); 20 PetscCall(PetscCalloc1(25 * mbs, &rtmp)); 21 PetscCall(PetscMalloc2(mbs, &il, mbs, &jl)); 22 il[0] = 0; 23 for (i = 0; i < mbs; i++) jl[i] = mbs; 24 25 PetscCall(PetscMalloc2(25, &dk, 25, &uik)); 26 ai = a->i; 27 aj = a->j; 28 aa = a->a; 29 30 /* for each row k */ 31 for (k = 0; k < mbs; k++) { 32 /*initialize k-th row with elements nonzero in row k of A */ 33 jmin = ai[k]; 34 jmax = ai[k + 1]; 35 if (jmin < jmax) { 36 ap = aa + jmin * 25; 37 for (j = jmin; j < jmax; j++) { 38 vj = aj[j]; /* block col. index */ 39 rtmp_ptr = rtmp + vj * 25; 40 for (i = 0; i < 25; i++) *rtmp_ptr++ = *ap++; 41 } 42 } 43 44 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 45 PetscCall(PetscArraycpy(dk, rtmp + k * 25, 25)); 46 i = jl[k]; /* first row to be added to k_th row */ 47 48 while (i < mbs) { 49 nexti = jl[i]; /* next row to be added to k_th row */ 50 51 /* compute multiplier */ 52 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 53 54 /* uik = -inv(Di)*U_bar(i,k) */ 55 d = ba + i * 25; 56 u = ba + ili * 25; 57 58 uik[0] = -(d[0] * u[0] + d[5] * u[1] + d[10] * u[2] + d[15] * u[3] + d[20] * u[4]); 59 uik[1] = -(d[1] * u[0] + d[6] * u[1] + d[11] * u[2] + d[16] * u[3] + d[21] * u[4]); 60 uik[2] = -(d[2] * u[0] + d[7] * u[1] + d[12] * u[2] + d[17] * u[3] + d[22] * u[4]); 61 uik[3] = -(d[3] * u[0] + d[8] * u[1] + d[13] * u[2] + d[18] * u[3] + d[23] * u[4]); 62 uik[4] = -(d[4] * u[0] + d[9] * u[1] + d[14] * u[2] + d[19] * u[3] + d[24] * u[4]); 63 64 uik[5] = -(d[0] * u[5] + d[5] * u[6] + d[10] * u[7] + d[15] * u[8] + d[20] * u[9]); 65 uik[6] = -(d[1] * u[5] + d[6] * u[6] + d[11] * u[7] + d[16] * u[8] + d[21] * u[9]); 66 uik[7] = -(d[2] * u[5] + d[7] * u[6] + d[12] * u[7] + d[17] * u[8] + d[22] * u[9]); 67 uik[8] = -(d[3] * u[5] + d[8] * u[6] + d[13] * u[7] + d[18] * u[8] + d[23] * u[9]); 68 uik[9] = -(d[4] * u[5] + d[9] * u[6] + d[14] * u[7] + d[19] * u[8] + d[24] * u[9]); 69 70 uik[10] = -(d[0] * u[10] + d[5] * u[11] + d[10] * u[12] + d[15] * u[13] + d[20] * u[14]); 71 uik[11] = -(d[1] * u[10] + d[6] * u[11] + d[11] * u[12] + d[16] * u[13] + d[21] * u[14]); 72 uik[12] = -(d[2] * u[10] + d[7] * u[11] + d[12] * u[12] + d[17] * u[13] + d[22] * u[14]); 73 uik[13] = -(d[3] * u[10] + d[8] * u[11] + d[13] * u[12] + d[18] * u[13] + d[23] * u[14]); 74 uik[14] = -(d[4] * u[10] + d[9] * u[11] + d[14] * u[12] + d[19] * u[13] + d[24] * u[14]); 75 76 uik[15] = -(d[0] * u[15] + d[5] * u[16] + d[10] * u[17] + d[15] * u[18] + d[20] * u[19]); 77 uik[16] = -(d[1] * u[15] + d[6] * u[16] + d[11] * u[17] + d[16] * u[18] + d[21] * u[19]); 78 uik[17] = -(d[2] * u[15] + d[7] * u[16] + d[12] * u[17] + d[17] * u[18] + d[22] * u[19]); 79 uik[18] = -(d[3] * u[15] + d[8] * u[16] + d[13] * u[17] + d[18] * u[18] + d[23] * u[19]); 80 uik[19] = -(d[4] * u[15] + d[9] * u[16] + d[14] * u[17] + d[19] * u[18] + d[24] * u[19]); 81 82 uik[20] = -(d[0] * u[20] + d[5] * u[21] + d[10] * u[22] + d[15] * u[23] + d[20] * u[24]); 83 uik[21] = -(d[1] * u[20] + d[6] * u[21] + d[11] * u[22] + d[16] * u[23] + d[21] * u[24]); 84 uik[22] = -(d[2] * u[20] + d[7] * u[21] + d[12] * u[22] + d[17] * u[23] + d[22] * u[24]); 85 uik[23] = -(d[3] * u[20] + d[8] * u[21] + d[13] * u[22] + d[18] * u[23] + d[23] * u[24]); 86 uik[24] = -(d[4] * u[20] + d[9] * u[21] + d[14] * u[22] + d[19] * u[23] + d[24] * u[24]); 87 88 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 89 dk[0] += uik[0] * u[0] + uik[1] * u[1] + uik[2] * u[2] + uik[3] * u[3] + uik[4] * u[4]; 90 dk[1] += uik[5] * u[0] + uik[6] * u[1] + uik[7] * u[2] + uik[8] * u[3] + uik[9] * u[4]; 91 dk[2] += uik[10] * u[0] + uik[11] * u[1] + uik[12] * u[2] + uik[13] * u[3] + uik[14] * u[4]; 92 dk[3] += uik[15] * u[0] + uik[16] * u[1] + uik[17] * u[2] + uik[18] * u[3] + uik[19] * u[4]; 93 dk[4] += uik[20] * u[0] + uik[21] * u[1] + uik[22] * u[2] + uik[23] * u[3] + uik[24] * u[4]; 94 95 dk[5] += uik[0] * u[5] + uik[1] * u[6] + uik[2] * u[7] + uik[3] * u[8] + uik[4] * u[9]; 96 dk[6] += uik[5] * u[5] + uik[6] * u[6] + uik[7] * u[7] + uik[8] * u[8] + uik[9] * u[9]; 97 dk[7] += uik[10] * u[5] + uik[11] * u[6] + uik[12] * u[7] + uik[13] * u[8] + uik[14] * u[9]; 98 dk[8] += uik[15] * u[5] + uik[16] * u[6] + uik[17] * u[7] + uik[18] * u[8] + uik[19] * u[9]; 99 dk[9] += uik[20] * u[5] + uik[21] * u[6] + uik[22] * u[7] + uik[23] * u[8] + uik[24] * u[9]; 100 101 dk[10] += uik[0] * u[10] + uik[1] * u[11] + uik[2] * u[12] + uik[3] * u[13] + uik[4] * u[14]; 102 dk[11] += uik[5] * u[10] + uik[6] * u[11] + uik[7] * u[12] + uik[8] * u[13] + uik[9] * u[14]; 103 dk[12] += uik[10] * u[10] + uik[11] * u[11] + uik[12] * u[12] + uik[13] * u[13] + uik[14] * u[14]; 104 dk[13] += uik[15] * u[10] + uik[16] * u[11] + uik[17] * u[12] + uik[18] * u[13] + uik[19] * u[14]; 105 dk[14] += uik[20] * u[10] + uik[21] * u[11] + uik[22] * u[12] + uik[23] * u[13] + uik[24] * u[14]; 106 107 dk[15] += uik[0] * u[15] + uik[1] * u[16] + uik[2] * u[17] + uik[3] * u[18] + uik[4] * u[19]; 108 dk[16] += uik[5] * u[15] + uik[6] * u[16] + uik[7] * u[17] + uik[8] * u[18] + uik[9] * u[19]; 109 dk[17] += uik[10] * u[15] + uik[11] * u[16] + uik[12] * u[17] + uik[13] * u[18] + uik[14] * u[19]; 110 dk[18] += uik[15] * u[15] + uik[16] * u[16] + uik[17] * u[17] + uik[18] * u[18] + uik[19] * u[19]; 111 dk[19] += uik[20] * u[15] + uik[21] * u[16] + uik[22] * u[17] + uik[23] * u[18] + uik[24] * u[19]; 112 113 dk[20] += uik[0] * u[20] + uik[1] * u[21] + uik[2] * u[22] + uik[3] * u[23] + uik[4] * u[24]; 114 dk[21] += uik[5] * u[20] + uik[6] * u[21] + uik[7] * u[22] + uik[8] * u[23] + uik[9] * u[24]; 115 dk[22] += uik[10] * u[20] + uik[11] * u[21] + uik[12] * u[22] + uik[13] * u[23] + uik[14] * u[24]; 116 dk[23] += uik[15] * u[20] + uik[16] * u[21] + uik[17] * u[22] + uik[18] * u[23] + uik[19] * u[24]; 117 dk[24] += uik[20] * u[20] + uik[21] * u[21] + uik[22] * u[22] + uik[23] * u[23] + uik[24] * u[24]; 118 119 PetscCall(PetscLogFlops(125.0 * 4.0)); 120 121 /* update -U(i,k) */ 122 PetscCall(PetscArraycpy(ba + ili * 25, uik, 25)); 123 124 /* add multiple of row i to k-th row ... */ 125 jmin = ili + 1; 126 jmax = bi[i + 1]; 127 if (jmin < jmax) { 128 for (j = jmin; j < jmax; j++) { 129 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 130 rtmp_ptr = rtmp + bj[j] * 25; 131 u = ba + j * 25; 132 rtmp_ptr[0] += uik[0] * u[0] + uik[1] * u[1] + uik[2] * u[2] + uik[3] * u[3] + uik[4] * u[4]; 133 rtmp_ptr[1] += uik[5] * u[0] + uik[6] * u[1] + uik[7] * u[2] + uik[8] * u[3] + uik[9] * u[4]; 134 rtmp_ptr[2] += uik[10] * u[0] + uik[11] * u[1] + uik[12] * u[2] + uik[13] * u[3] + uik[14] * u[4]; 135 rtmp_ptr[3] += uik[15] * u[0] + uik[16] * u[1] + uik[17] * u[2] + uik[18] * u[3] + uik[19] * u[4]; 136 rtmp_ptr[4] += uik[20] * u[0] + uik[21] * u[1] + uik[22] * u[2] + uik[23] * u[3] + uik[24] * u[4]; 137 138 rtmp_ptr[5] += uik[0] * u[5] + uik[1] * u[6] + uik[2] * u[7] + uik[3] * u[8] + uik[4] * u[9]; 139 rtmp_ptr[6] += uik[5] * u[5] + uik[6] * u[6] + uik[7] * u[7] + uik[8] * u[8] + uik[9] * u[9]; 140 rtmp_ptr[7] += uik[10] * u[5] + uik[11] * u[6] + uik[12] * u[7] + uik[13] * u[8] + uik[14] * u[9]; 141 rtmp_ptr[8] += uik[15] * u[5] + uik[16] * u[6] + uik[17] * u[7] + uik[18] * u[8] + uik[19] * u[9]; 142 rtmp_ptr[9] += uik[20] * u[5] + uik[21] * u[6] + uik[22] * u[7] + uik[23] * u[8] + uik[24] * u[9]; 143 144 rtmp_ptr[10] += uik[0] * u[10] + uik[1] * u[11] + uik[2] * u[12] + uik[3] * u[13] + uik[4] * u[14]; 145 rtmp_ptr[11] += uik[5] * u[10] + uik[6] * u[11] + uik[7] * u[12] + uik[8] * u[13] + uik[9] * u[14]; 146 rtmp_ptr[12] += uik[10] * u[10] + uik[11] * u[11] + uik[12] * u[12] + uik[13] * u[13] + uik[14] * u[14]; 147 rtmp_ptr[13] += uik[15] * u[10] + uik[16] * u[11] + uik[17] * u[12] + uik[18] * u[13] + uik[19] * u[14]; 148 rtmp_ptr[14] += uik[20] * u[10] + uik[21] * u[11] + uik[22] * u[12] + uik[23] * u[13] + uik[24] * u[14]; 149 150 rtmp_ptr[15] += uik[0] * u[15] + uik[1] * u[16] + uik[2] * u[17] + uik[3] * u[18] + uik[4] * u[19]; 151 rtmp_ptr[16] += uik[5] * u[15] + uik[6] * u[16] + uik[7] * u[17] + uik[8] * u[18] + uik[9] * u[19]; 152 rtmp_ptr[17] += uik[10] * u[15] + uik[11] * u[16] + uik[12] * u[17] + uik[13] * u[18] + uik[14] * u[19]; 153 rtmp_ptr[18] += uik[15] * u[15] + uik[16] * u[16] + uik[17] * u[17] + uik[18] * u[18] + uik[19] * u[19]; 154 rtmp_ptr[19] += uik[20] * u[15] + uik[21] * u[16] + uik[22] * u[17] + uik[23] * u[18] + uik[24] * u[19]; 155 156 rtmp_ptr[20] += uik[0] * u[20] + uik[1] * u[21] + uik[2] * u[22] + uik[3] * u[23] + uik[4] * u[24]; 157 rtmp_ptr[21] += uik[5] * u[20] + uik[6] * u[21] + uik[7] * u[22] + uik[8] * u[23] + uik[9] * u[24]; 158 rtmp_ptr[22] += uik[10] * u[20] + uik[11] * u[21] + uik[12] * u[22] + uik[13] * u[23] + uik[14] * u[24]; 159 rtmp_ptr[23] += uik[15] * u[20] + uik[16] * u[21] + uik[17] * u[22] + uik[18] * u[23] + uik[19] * u[24]; 160 rtmp_ptr[24] += uik[20] * u[20] + uik[21] * u[21] + uik[22] * u[22] + uik[23] * u[23] + uik[24] * u[24]; 161 } 162 PetscCall(PetscLogFlops(2.0 * 125.0 * (jmax - jmin))); 163 164 /* ... add i to row list for next nonzero entry */ 165 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 166 j = bj[jmin]; 167 jl[i] = jl[j]; 168 jl[j] = i; /* update jl */ 169 } 170 i = nexti; 171 } 172 173 /* save nonzero entries in k-th row of U ... */ 174 175 /* invert diagonal block */ 176 d = ba + k * 25; 177 PetscCall(PetscArraycpy(d, dk, 25)); 178 PetscCall(PetscKernel_A_gets_inverse_A_5(d, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 179 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 180 181 jmin = bi[k]; 182 jmax = bi[k + 1]; 183 if (jmin < jmax) { 184 for (j = jmin; j < jmax; j++) { 185 vj = bj[j]; /* block col. index of U */ 186 u = ba + j * 25; 187 rtmp_ptr = rtmp + vj * 25; 188 for (k1 = 0; k1 < 25; k1++) { 189 *u++ = *rtmp_ptr; 190 *rtmp_ptr++ = 0.0; 191 } 192 } 193 194 /* ... add k to row list for first nonzero entry in k-th row */ 195 il[k] = jmin; 196 i = bj[jmin]; 197 jl[k] = jl[i]; 198 jl[i] = k; 199 } 200 } 201 202 PetscCall(PetscFree(rtmp)); 203 PetscCall(PetscFree2(il, jl)); 204 PetscCall(PetscFree2(dk, uik)); 205 206 C->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace; 207 C->ops->solvetranspose = MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace; 208 C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace; 209 C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace; 210 C->assembled = PETSC_TRUE; 211 C->preallocated = PETSC_TRUE; 212 213 PetscCall(PetscLogFlops(1.3333 * 125 * b->mbs)); /* from inverting diagonal blocks */ 214 PetscFunctionReturn(PETSC_SUCCESS); 215 } 216