1 2 #include <../src/mat/impls/sbaij/seq/sbaij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 5 /* 6 Version for when blocks are 5 by 5 Using natural ordering 7 */ 8 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) 9 { 10 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data; 11 PetscInt i, j, mbs = a->mbs, *bi = b->i, *bj = b->j; 12 PetscInt *ai, *aj, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili, ipvt[5]; 13 MatScalar *ba = b->a, *aa, *ap, *dk, *uik; 14 MatScalar *u, *d, *rtmp, *rtmp_ptr, work[25]; 15 PetscReal shift = info->shiftamount; 16 PetscBool allowzeropivot, zeropivotdetected; 17 18 PetscFunctionBegin; 19 /* initialization */ 20 allowzeropivot = PetscNot(A->erroriffailure); 21 PetscCall(PetscCalloc1(25 * mbs, &rtmp)); 22 PetscCall(PetscMalloc2(mbs, &il, mbs, &jl)); 23 il[0] = 0; 24 for (i = 0; i < mbs; i++) jl[i] = mbs; 25 26 PetscCall(PetscMalloc2(25, &dk, 25, &uik)); 27 ai = a->i; 28 aj = a->j; 29 aa = a->a; 30 31 /* for each row k */ 32 for (k = 0; k < mbs; k++) { 33 /*initialize k-th row with elements nonzero in row k of A */ 34 jmin = ai[k]; 35 jmax = ai[k + 1]; 36 if (jmin < jmax) { 37 ap = aa + jmin * 25; 38 for (j = jmin; j < jmax; j++) { 39 vj = aj[j]; /* block col. index */ 40 rtmp_ptr = rtmp + vj * 25; 41 for (i = 0; i < 25; i++) *rtmp_ptr++ = *ap++; 42 } 43 } 44 45 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 46 PetscCall(PetscArraycpy(dk, rtmp + k * 25, 25)); 47 i = jl[k]; /* first row to be added to k_th row */ 48 49 while (i < mbs) { 50 nexti = jl[i]; /* next row to be added to k_th row */ 51 52 /* compute multiplier */ 53 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 54 55 /* uik = -inv(Di)*U_bar(i,k) */ 56 d = ba + i * 25; 57 u = ba + ili * 25; 58 59 uik[0] = -(d[0] * u[0] + d[5] * u[1] + d[10] * u[2] + d[15] * u[3] + d[20] * u[4]); 60 uik[1] = -(d[1] * u[0] + d[6] * u[1] + d[11] * u[2] + d[16] * u[3] + d[21] * u[4]); 61 uik[2] = -(d[2] * u[0] + d[7] * u[1] + d[12] * u[2] + d[17] * u[3] + d[22] * u[4]); 62 uik[3] = -(d[3] * u[0] + d[8] * u[1] + d[13] * u[2] + d[18] * u[3] + d[23] * u[4]); 63 uik[4] = -(d[4] * u[0] + d[9] * u[1] + d[14] * u[2] + d[19] * u[3] + d[24] * u[4]); 64 65 uik[5] = -(d[0] * u[5] + d[5] * u[6] + d[10] * u[7] + d[15] * u[8] + d[20] * u[9]); 66 uik[6] = -(d[1] * u[5] + d[6] * u[6] + d[11] * u[7] + d[16] * u[8] + d[21] * u[9]); 67 uik[7] = -(d[2] * u[5] + d[7] * u[6] + d[12] * u[7] + d[17] * u[8] + d[22] * u[9]); 68 uik[8] = -(d[3] * u[5] + d[8] * u[6] + d[13] * u[7] + d[18] * u[8] + d[23] * u[9]); 69 uik[9] = -(d[4] * u[5] + d[9] * u[6] + d[14] * u[7] + d[19] * u[8] + d[24] * u[9]); 70 71 uik[10] = -(d[0] * u[10] + d[5] * u[11] + d[10] * u[12] + d[15] * u[13] + d[20] * u[14]); 72 uik[11] = -(d[1] * u[10] + d[6] * u[11] + d[11] * u[12] + d[16] * u[13] + d[21] * u[14]); 73 uik[12] = -(d[2] * u[10] + d[7] * u[11] + d[12] * u[12] + d[17] * u[13] + d[22] * u[14]); 74 uik[13] = -(d[3] * u[10] + d[8] * u[11] + d[13] * u[12] + d[18] * u[13] + d[23] * u[14]); 75 uik[14] = -(d[4] * u[10] + d[9] * u[11] + d[14] * u[12] + d[19] * u[13] + d[24] * u[14]); 76 77 uik[15] = -(d[0] * u[15] + d[5] * u[16] + d[10] * u[17] + d[15] * u[18] + d[20] * u[19]); 78 uik[16] = -(d[1] * u[15] + d[6] * u[16] + d[11] * u[17] + d[16] * u[18] + d[21] * u[19]); 79 uik[17] = -(d[2] * u[15] + d[7] * u[16] + d[12] * u[17] + d[17] * u[18] + d[22] * u[19]); 80 uik[18] = -(d[3] * u[15] + d[8] * u[16] + d[13] * u[17] + d[18] * u[18] + d[23] * u[19]); 81 uik[19] = -(d[4] * u[15] + d[9] * u[16] + d[14] * u[17] + d[19] * u[18] + d[24] * u[19]); 82 83 uik[20] = -(d[0] * u[20] + d[5] * u[21] + d[10] * u[22] + d[15] * u[23] + d[20] * u[24]); 84 uik[21] = -(d[1] * u[20] + d[6] * u[21] + d[11] * u[22] + d[16] * u[23] + d[21] * u[24]); 85 uik[22] = -(d[2] * u[20] + d[7] * u[21] + d[12] * u[22] + d[17] * u[23] + d[22] * u[24]); 86 uik[23] = -(d[3] * u[20] + d[8] * u[21] + d[13] * u[22] + d[18] * u[23] + d[23] * u[24]); 87 uik[24] = -(d[4] * u[20] + d[9] * u[21] + d[14] * u[22] + d[19] * u[23] + d[24] * u[24]); 88 89 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 90 dk[0] += uik[0] * u[0] + uik[1] * u[1] + uik[2] * u[2] + uik[3] * u[3] + uik[4] * u[4]; 91 dk[1] += uik[5] * u[0] + uik[6] * u[1] + uik[7] * u[2] + uik[8] * u[3] + uik[9] * u[4]; 92 dk[2] += uik[10] * u[0] + uik[11] * u[1] + uik[12] * u[2] + uik[13] * u[3] + uik[14] * u[4]; 93 dk[3] += uik[15] * u[0] + uik[16] * u[1] + uik[17] * u[2] + uik[18] * u[3] + uik[19] * u[4]; 94 dk[4] += uik[20] * u[0] + uik[21] * u[1] + uik[22] * u[2] + uik[23] * u[3] + uik[24] * u[4]; 95 96 dk[5] += uik[0] * u[5] + uik[1] * u[6] + uik[2] * u[7] + uik[3] * u[8] + uik[4] * u[9]; 97 dk[6] += uik[5] * u[5] + uik[6] * u[6] + uik[7] * u[7] + uik[8] * u[8] + uik[9] * u[9]; 98 dk[7] += uik[10] * u[5] + uik[11] * u[6] + uik[12] * u[7] + uik[13] * u[8] + uik[14] * u[9]; 99 dk[8] += uik[15] * u[5] + uik[16] * u[6] + uik[17] * u[7] + uik[18] * u[8] + uik[19] * u[9]; 100 dk[9] += uik[20] * u[5] + uik[21] * u[6] + uik[22] * u[7] + uik[23] * u[8] + uik[24] * u[9]; 101 102 dk[10] += uik[0] * u[10] + uik[1] * u[11] + uik[2] * u[12] + uik[3] * u[13] + uik[4] * u[14]; 103 dk[11] += uik[5] * u[10] + uik[6] * u[11] + uik[7] * u[12] + uik[8] * u[13] + uik[9] * u[14]; 104 dk[12] += uik[10] * u[10] + uik[11] * u[11] + uik[12] * u[12] + uik[13] * u[13] + uik[14] * u[14]; 105 dk[13] += uik[15] * u[10] + uik[16] * u[11] + uik[17] * u[12] + uik[18] * u[13] + uik[19] * u[14]; 106 dk[14] += uik[20] * u[10] + uik[21] * u[11] + uik[22] * u[12] + uik[23] * u[13] + uik[24] * u[14]; 107 108 dk[15] += uik[0] * u[15] + uik[1] * u[16] + uik[2] * u[17] + uik[3] * u[18] + uik[4] * u[19]; 109 dk[16] += uik[5] * u[15] + uik[6] * u[16] + uik[7] * u[17] + uik[8] * u[18] + uik[9] * u[19]; 110 dk[17] += uik[10] * u[15] + uik[11] * u[16] + uik[12] * u[17] + uik[13] * u[18] + uik[14] * u[19]; 111 dk[18] += uik[15] * u[15] + uik[16] * u[16] + uik[17] * u[17] + uik[18] * u[18] + uik[19] * u[19]; 112 dk[19] += uik[20] * u[15] + uik[21] * u[16] + uik[22] * u[17] + uik[23] * u[18] + uik[24] * u[19]; 113 114 dk[20] += uik[0] * u[20] + uik[1] * u[21] + uik[2] * u[22] + uik[3] * u[23] + uik[4] * u[24]; 115 dk[21] += uik[5] * u[20] + uik[6] * u[21] + uik[7] * u[22] + uik[8] * u[23] + uik[9] * u[24]; 116 dk[22] += uik[10] * u[20] + uik[11] * u[21] + uik[12] * u[22] + uik[13] * u[23] + uik[14] * u[24]; 117 dk[23] += uik[15] * u[20] + uik[16] * u[21] + uik[17] * u[22] + uik[18] * u[23] + uik[19] * u[24]; 118 dk[24] += uik[20] * u[20] + uik[21] * u[21] + uik[22] * u[22] + uik[23] * u[23] + uik[24] * u[24]; 119 120 PetscCall(PetscLogFlops(125.0 * 4.0)); 121 122 /* update -U(i,k) */ 123 PetscCall(PetscArraycpy(ba + ili * 25, uik, 25)); 124 125 /* add multiple of row i to k-th row ... */ 126 jmin = ili + 1; 127 jmax = bi[i + 1]; 128 if (jmin < jmax) { 129 for (j = jmin; j < jmax; j++) { 130 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 131 rtmp_ptr = rtmp + bj[j] * 25; 132 u = ba + j * 25; 133 rtmp_ptr[0] += uik[0] * u[0] + uik[1] * u[1] + uik[2] * u[2] + uik[3] * u[3] + uik[4] * u[4]; 134 rtmp_ptr[1] += uik[5] * u[0] + uik[6] * u[1] + uik[7] * u[2] + uik[8] * u[3] + uik[9] * u[4]; 135 rtmp_ptr[2] += uik[10] * u[0] + uik[11] * u[1] + uik[12] * u[2] + uik[13] * u[3] + uik[14] * u[4]; 136 rtmp_ptr[3] += uik[15] * u[0] + uik[16] * u[1] + uik[17] * u[2] + uik[18] * u[3] + uik[19] * u[4]; 137 rtmp_ptr[4] += uik[20] * u[0] + uik[21] * u[1] + uik[22] * u[2] + uik[23] * u[3] + uik[24] * u[4]; 138 139 rtmp_ptr[5] += uik[0] * u[5] + uik[1] * u[6] + uik[2] * u[7] + uik[3] * u[8] + uik[4] * u[9]; 140 rtmp_ptr[6] += uik[5] * u[5] + uik[6] * u[6] + uik[7] * u[7] + uik[8] * u[8] + uik[9] * u[9]; 141 rtmp_ptr[7] += uik[10] * u[5] + uik[11] * u[6] + uik[12] * u[7] + uik[13] * u[8] + uik[14] * u[9]; 142 rtmp_ptr[8] += uik[15] * u[5] + uik[16] * u[6] + uik[17] * u[7] + uik[18] * u[8] + uik[19] * u[9]; 143 rtmp_ptr[9] += uik[20] * u[5] + uik[21] * u[6] + uik[22] * u[7] + uik[23] * u[8] + uik[24] * u[9]; 144 145 rtmp_ptr[10] += uik[0] * u[10] + uik[1] * u[11] + uik[2] * u[12] + uik[3] * u[13] + uik[4] * u[14]; 146 rtmp_ptr[11] += uik[5] * u[10] + uik[6] * u[11] + uik[7] * u[12] + uik[8] * u[13] + uik[9] * u[14]; 147 rtmp_ptr[12] += uik[10] * u[10] + uik[11] * u[11] + uik[12] * u[12] + uik[13] * u[13] + uik[14] * u[14]; 148 rtmp_ptr[13] += uik[15] * u[10] + uik[16] * u[11] + uik[17] * u[12] + uik[18] * u[13] + uik[19] * u[14]; 149 rtmp_ptr[14] += uik[20] * u[10] + uik[21] * u[11] + uik[22] * u[12] + uik[23] * u[13] + uik[24] * u[14]; 150 151 rtmp_ptr[15] += uik[0] * u[15] + uik[1] * u[16] + uik[2] * u[17] + uik[3] * u[18] + uik[4] * u[19]; 152 rtmp_ptr[16] += uik[5] * u[15] + uik[6] * u[16] + uik[7] * u[17] + uik[8] * u[18] + uik[9] * u[19]; 153 rtmp_ptr[17] += uik[10] * u[15] + uik[11] * u[16] + uik[12] * u[17] + uik[13] * u[18] + uik[14] * u[19]; 154 rtmp_ptr[18] += uik[15] * u[15] + uik[16] * u[16] + uik[17] * u[17] + uik[18] * u[18] + uik[19] * u[19]; 155 rtmp_ptr[19] += uik[20] * u[15] + uik[21] * u[16] + uik[22] * u[17] + uik[23] * u[18] + uik[24] * u[19]; 156 157 rtmp_ptr[20] += uik[0] * u[20] + uik[1] * u[21] + uik[2] * u[22] + uik[3] * u[23] + uik[4] * u[24]; 158 rtmp_ptr[21] += uik[5] * u[20] + uik[6] * u[21] + uik[7] * u[22] + uik[8] * u[23] + uik[9] * u[24]; 159 rtmp_ptr[22] += uik[10] * u[20] + uik[11] * u[21] + uik[12] * u[22] + uik[13] * u[23] + uik[14] * u[24]; 160 rtmp_ptr[23] += uik[15] * u[20] + uik[16] * u[21] + uik[17] * u[22] + uik[18] * u[23] + uik[19] * u[24]; 161 rtmp_ptr[24] += uik[20] * u[20] + uik[21] * u[21] + uik[22] * u[22] + uik[23] * u[23] + uik[24] * u[24]; 162 } 163 PetscCall(PetscLogFlops(2.0 * 125.0 * (jmax - jmin))); 164 165 /* ... add i to row list for next nonzero entry */ 166 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 167 j = bj[jmin]; 168 jl[i] = jl[j]; 169 jl[j] = i; /* update jl */ 170 } 171 i = nexti; 172 } 173 174 /* save nonzero entries in k-th row of U ... */ 175 176 /* invert diagonal block */ 177 d = ba + k * 25; 178 PetscCall(PetscArraycpy(d, dk, 25)); 179 PetscCall(PetscKernel_A_gets_inverse_A_5(d, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 180 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 181 182 jmin = bi[k]; 183 jmax = bi[k + 1]; 184 if (jmin < jmax) { 185 for (j = jmin; j < jmax; j++) { 186 vj = bj[j]; /* block col. index of U */ 187 u = ba + j * 25; 188 rtmp_ptr = rtmp + vj * 25; 189 for (k1 = 0; k1 < 25; k1++) { 190 *u++ = *rtmp_ptr; 191 *rtmp_ptr++ = 0.0; 192 } 193 } 194 195 /* ... add k to row list for first nonzero entry in k-th row */ 196 il[k] = jmin; 197 i = bj[jmin]; 198 jl[k] = jl[i]; 199 jl[i] = k; 200 } 201 } 202 203 PetscCall(PetscFree(rtmp)); 204 PetscCall(PetscFree2(il, jl)); 205 PetscCall(PetscFree2(dk, uik)); 206 207 C->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace; 208 C->ops->solvetranspose = MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace; 209 C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace; 210 C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace; 211 C->assembled = PETSC_TRUE; 212 C->preallocated = PETSC_TRUE; 213 214 PetscCall(PetscLogFlops(1.3333 * 125 * b->mbs)); /* from inverting diagonal blocks */ 215 PetscFunctionReturn(0); 216 } 217