1be1d678aSKris Buschelman 283287d42SBarry Smith /* 383287d42SBarry Smith Factorization code for BAIJ format. 483287d42SBarry Smith */ 5c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 6af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 783287d42SBarry Smith /* 883287d42SBarry Smith Version for when blocks are 7 by 7 983287d42SBarry Smith */ 10*9371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_7_inplace(Mat C, Mat A, const MatFactorInfo *info) { 1183287d42SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 1283287d42SBarry Smith IS isrow = b->row, isicol = b->icol; 135d0c19d7SBarry Smith const PetscInt *r, *ic, *bi = b->i, *bj = b->j, *ajtmp, *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj, *ajtmpold; 145d0c19d7SBarry Smith PetscInt i, j, n = a->mbs, nz, row, idx; 1583287d42SBarry Smith MatScalar *pv, *v, *rtmp, *pc, *w, *x; 1683287d42SBarry Smith MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 1783287d42SBarry Smith MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 1883287d42SBarry Smith MatScalar x17, x18, x19, x20, x21, x22, x23, x24, x25, p10, p11, p12, p13, p14; 1983287d42SBarry Smith MatScalar p15, p16, p17, p18, p19, p20, p21, p22, p23, p24, p25, m10, m11, m12; 2083287d42SBarry Smith MatScalar m13, m14, m15, m16, m17, m18, m19, m20, m21, m22, m23, m24, m25; 2183287d42SBarry Smith MatScalar p26, p27, p28, p29, p30, p31, p32, p33, p34, p35, p36; 2283287d42SBarry Smith MatScalar p37, p38, p39, p40, p41, p42, p43, p44, p45, p46, p47, p48, p49; 2383287d42SBarry Smith MatScalar x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36; 2483287d42SBarry Smith MatScalar x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49; 2583287d42SBarry Smith MatScalar m26, m27, m28, m29, m30, m31, m32, m33, m34, m35, m36; 2683287d42SBarry Smith MatScalar m37, m38, m39, m40, m41, m42, m43, m44, m45, m46, m47, m48, m49; 2783287d42SBarry Smith MatScalar *ba = b->a, *aa = a->a; 28182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 29a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected; 3083287d42SBarry Smith 3183287d42SBarry Smith PetscFunctionBegin; 320164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 339566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 349566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 359566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(49 * (n + 1), &rtmp)); 3683287d42SBarry Smith 3783287d42SBarry Smith for (i = 0; i < n; i++) { 3883287d42SBarry Smith nz = bi[i + 1] - bi[i]; 3983287d42SBarry Smith ajtmp = bj + bi[i]; 4083287d42SBarry Smith for (j = 0; j < nz; j++) { 4183287d42SBarry Smith x = rtmp + 49 * ajtmp[j]; 4283287d42SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 4383287d42SBarry Smith x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 4483287d42SBarry Smith x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0; 4583287d42SBarry Smith x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0; 4683287d42SBarry Smith x[34] = x[35] = x[36] = x[37] = x[38] = x[39] = x[40] = x[41] = 0.0; 4783287d42SBarry Smith x[42] = x[43] = x[44] = x[45] = x[46] = x[47] = x[48] = 0.0; 4883287d42SBarry Smith } 4983287d42SBarry Smith /* load in initial (unfactored row) */ 5083287d42SBarry Smith idx = r[i]; 5183287d42SBarry Smith nz = ai[idx + 1] - ai[idx]; 5283287d42SBarry Smith ajtmpold = aj + ai[idx]; 5383287d42SBarry Smith v = aa + 49 * ai[idx]; 5483287d42SBarry Smith for (j = 0; j < nz; j++) { 5583287d42SBarry Smith x = rtmp + 49 * ic[ajtmpold[j]]; 56*9371c9d4SSatish Balay x[0] = v[0]; 57*9371c9d4SSatish Balay x[1] = v[1]; 58*9371c9d4SSatish Balay x[2] = v[2]; 59*9371c9d4SSatish Balay x[3] = v[3]; 60*9371c9d4SSatish Balay x[4] = v[4]; 61*9371c9d4SSatish Balay x[5] = v[5]; 62*9371c9d4SSatish Balay x[6] = v[6]; 63*9371c9d4SSatish Balay x[7] = v[7]; 64*9371c9d4SSatish Balay x[8] = v[8]; 65*9371c9d4SSatish Balay x[9] = v[9]; 66*9371c9d4SSatish Balay x[10] = v[10]; 67*9371c9d4SSatish Balay x[11] = v[11]; 68*9371c9d4SSatish Balay x[12] = v[12]; 69*9371c9d4SSatish Balay x[13] = v[13]; 70*9371c9d4SSatish Balay x[14] = v[14]; 71*9371c9d4SSatish Balay x[15] = v[15]; 72*9371c9d4SSatish Balay x[16] = v[16]; 73*9371c9d4SSatish Balay x[17] = v[17]; 74*9371c9d4SSatish Balay x[18] = v[18]; 75*9371c9d4SSatish Balay x[19] = v[19]; 76*9371c9d4SSatish Balay x[20] = v[20]; 77*9371c9d4SSatish Balay x[21] = v[21]; 78*9371c9d4SSatish Balay x[22] = v[22]; 79*9371c9d4SSatish Balay x[23] = v[23]; 80*9371c9d4SSatish Balay x[24] = v[24]; 81*9371c9d4SSatish Balay x[25] = v[25]; 82*9371c9d4SSatish Balay x[26] = v[26]; 83*9371c9d4SSatish Balay x[27] = v[27]; 84*9371c9d4SSatish Balay x[28] = v[28]; 85*9371c9d4SSatish Balay x[29] = v[29]; 86*9371c9d4SSatish Balay x[30] = v[30]; 87*9371c9d4SSatish Balay x[31] = v[31]; 88*9371c9d4SSatish Balay x[32] = v[32]; 89*9371c9d4SSatish Balay x[33] = v[33]; 90*9371c9d4SSatish Balay x[34] = v[34]; 91*9371c9d4SSatish Balay x[35] = v[35]; 92*9371c9d4SSatish Balay x[36] = v[36]; 93*9371c9d4SSatish Balay x[37] = v[37]; 94*9371c9d4SSatish Balay x[38] = v[38]; 95*9371c9d4SSatish Balay x[39] = v[39]; 96*9371c9d4SSatish Balay x[40] = v[40]; 97*9371c9d4SSatish Balay x[41] = v[41]; 98*9371c9d4SSatish Balay x[42] = v[42]; 99*9371c9d4SSatish Balay x[43] = v[43]; 100*9371c9d4SSatish Balay x[44] = v[44]; 101*9371c9d4SSatish Balay x[45] = v[45]; 102*9371c9d4SSatish Balay x[46] = v[46]; 103*9371c9d4SSatish Balay x[47] = v[47]; 10483287d42SBarry Smith x[48] = v[48]; 10583287d42SBarry Smith v += 49; 10683287d42SBarry Smith } 10783287d42SBarry Smith row = *ajtmp++; 10883287d42SBarry Smith while (row < i) { 10983287d42SBarry Smith pc = rtmp + 49 * row; 110*9371c9d4SSatish Balay p1 = pc[0]; 111*9371c9d4SSatish Balay p2 = pc[1]; 112*9371c9d4SSatish Balay p3 = pc[2]; 113*9371c9d4SSatish Balay p4 = pc[3]; 114*9371c9d4SSatish Balay p5 = pc[4]; 115*9371c9d4SSatish Balay p6 = pc[5]; 116*9371c9d4SSatish Balay p7 = pc[6]; 117*9371c9d4SSatish Balay p8 = pc[7]; 118*9371c9d4SSatish Balay p9 = pc[8]; 119*9371c9d4SSatish Balay p10 = pc[9]; 120*9371c9d4SSatish Balay p11 = pc[10]; 121*9371c9d4SSatish Balay p12 = pc[11]; 122*9371c9d4SSatish Balay p13 = pc[12]; 123*9371c9d4SSatish Balay p14 = pc[13]; 124*9371c9d4SSatish Balay p15 = pc[14]; 125*9371c9d4SSatish Balay p16 = pc[15]; 126*9371c9d4SSatish Balay p17 = pc[16]; 127*9371c9d4SSatish Balay p18 = pc[17]; 128*9371c9d4SSatish Balay p19 = pc[18]; 129*9371c9d4SSatish Balay p20 = pc[19]; 130*9371c9d4SSatish Balay p21 = pc[20]; 131*9371c9d4SSatish Balay p22 = pc[21]; 132*9371c9d4SSatish Balay p23 = pc[22]; 133*9371c9d4SSatish Balay p24 = pc[23]; 134*9371c9d4SSatish Balay p25 = pc[24]; 135*9371c9d4SSatish Balay p26 = pc[25]; 136*9371c9d4SSatish Balay p27 = pc[26]; 137*9371c9d4SSatish Balay p28 = pc[27]; 138*9371c9d4SSatish Balay p29 = pc[28]; 139*9371c9d4SSatish Balay p30 = pc[29]; 140*9371c9d4SSatish Balay p31 = pc[30]; 141*9371c9d4SSatish Balay p32 = pc[31]; 142*9371c9d4SSatish Balay p33 = pc[32]; 143*9371c9d4SSatish Balay p34 = pc[33]; 144*9371c9d4SSatish Balay p35 = pc[34]; 145*9371c9d4SSatish Balay p36 = pc[35]; 146*9371c9d4SSatish Balay p37 = pc[36]; 147*9371c9d4SSatish Balay p38 = pc[37]; 148*9371c9d4SSatish Balay p39 = pc[38]; 149*9371c9d4SSatish Balay p40 = pc[39]; 150*9371c9d4SSatish Balay p41 = pc[40]; 151*9371c9d4SSatish Balay p42 = pc[41]; 152*9371c9d4SSatish Balay p43 = pc[42]; 153*9371c9d4SSatish Balay p44 = pc[43]; 154*9371c9d4SSatish Balay p45 = pc[44]; 155*9371c9d4SSatish Balay p46 = pc[45]; 156*9371c9d4SSatish Balay p47 = pc[46]; 157*9371c9d4SSatish Balay p48 = pc[47]; 15883287d42SBarry Smith p49 = pc[48]; 159*9371c9d4SSatish Balay if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0 || p37 != 0.0 || p38 != 0.0 || p39 != 0.0 || p40 != 0.0 || p41 != 0.0 || p42 != 0.0 || p43 != 0.0 || p44 != 0.0 || p45 != 0.0 || p46 != 0.0 || p47 != 0.0 || p48 != 0.0 || p49 != 0.0) { 16083287d42SBarry Smith pv = ba + 49 * diag_offset[row]; 16183287d42SBarry Smith pj = bj + diag_offset[row] + 1; 162*9371c9d4SSatish Balay x1 = pv[0]; 163*9371c9d4SSatish Balay x2 = pv[1]; 164*9371c9d4SSatish Balay x3 = pv[2]; 165*9371c9d4SSatish Balay x4 = pv[3]; 166*9371c9d4SSatish Balay x5 = pv[4]; 167*9371c9d4SSatish Balay x6 = pv[5]; 168*9371c9d4SSatish Balay x7 = pv[6]; 169*9371c9d4SSatish Balay x8 = pv[7]; 170*9371c9d4SSatish Balay x9 = pv[8]; 171*9371c9d4SSatish Balay x10 = pv[9]; 172*9371c9d4SSatish Balay x11 = pv[10]; 173*9371c9d4SSatish Balay x12 = pv[11]; 174*9371c9d4SSatish Balay x13 = pv[12]; 175*9371c9d4SSatish Balay x14 = pv[13]; 176*9371c9d4SSatish Balay x15 = pv[14]; 177*9371c9d4SSatish Balay x16 = pv[15]; 178*9371c9d4SSatish Balay x17 = pv[16]; 179*9371c9d4SSatish Balay x18 = pv[17]; 180*9371c9d4SSatish Balay x19 = pv[18]; 181*9371c9d4SSatish Balay x20 = pv[19]; 182*9371c9d4SSatish Balay x21 = pv[20]; 183*9371c9d4SSatish Balay x22 = pv[21]; 184*9371c9d4SSatish Balay x23 = pv[22]; 185*9371c9d4SSatish Balay x24 = pv[23]; 186*9371c9d4SSatish Balay x25 = pv[24]; 187*9371c9d4SSatish Balay x26 = pv[25]; 188*9371c9d4SSatish Balay x27 = pv[26]; 189*9371c9d4SSatish Balay x28 = pv[27]; 190*9371c9d4SSatish Balay x29 = pv[28]; 191*9371c9d4SSatish Balay x30 = pv[29]; 192*9371c9d4SSatish Balay x31 = pv[30]; 193*9371c9d4SSatish Balay x32 = pv[31]; 194*9371c9d4SSatish Balay x33 = pv[32]; 195*9371c9d4SSatish Balay x34 = pv[33]; 196*9371c9d4SSatish Balay x35 = pv[34]; 197*9371c9d4SSatish Balay x36 = pv[35]; 198*9371c9d4SSatish Balay x37 = pv[36]; 199*9371c9d4SSatish Balay x38 = pv[37]; 200*9371c9d4SSatish Balay x39 = pv[38]; 201*9371c9d4SSatish Balay x40 = pv[39]; 202*9371c9d4SSatish Balay x41 = pv[40]; 203*9371c9d4SSatish Balay x42 = pv[41]; 204*9371c9d4SSatish Balay x43 = pv[42]; 205*9371c9d4SSatish Balay x44 = pv[43]; 206*9371c9d4SSatish Balay x45 = pv[44]; 207*9371c9d4SSatish Balay x46 = pv[45]; 208*9371c9d4SSatish Balay x47 = pv[46]; 209*9371c9d4SSatish Balay x48 = pv[47]; 21083287d42SBarry Smith x49 = pv[48]; 21183287d42SBarry Smith pc[0] = m1 = p1 * x1 + p8 * x2 + p15 * x3 + p22 * x4 + p29 * x5 + p36 * x6 + p43 * x7; 21283287d42SBarry Smith pc[1] = m2 = p2 * x1 + p9 * x2 + p16 * x3 + p23 * x4 + p30 * x5 + p37 * x6 + p44 * x7; 21383287d42SBarry Smith pc[2] = m3 = p3 * x1 + p10 * x2 + p17 * x3 + p24 * x4 + p31 * x5 + p38 * x6 + p45 * x7; 21483287d42SBarry Smith pc[3] = m4 = p4 * x1 + p11 * x2 + p18 * x3 + p25 * x4 + p32 * x5 + p39 * x6 + p46 * x7; 21583287d42SBarry Smith pc[4] = m5 = p5 * x1 + p12 * x2 + p19 * x3 + p26 * x4 + p33 * x5 + p40 * x6 + p47 * x7; 21683287d42SBarry Smith pc[5] = m6 = p6 * x1 + p13 * x2 + p20 * x3 + p27 * x4 + p34 * x5 + p41 * x6 + p48 * x7; 21783287d42SBarry Smith pc[6] = m7 = p7 * x1 + p14 * x2 + p21 * x3 + p28 * x4 + p35 * x5 + p42 * x6 + p49 * x7; 21883287d42SBarry Smith 21983287d42SBarry Smith pc[7] = m8 = p1 * x8 + p8 * x9 + p15 * x10 + p22 * x11 + p29 * x12 + p36 * x13 + p43 * x14; 22083287d42SBarry Smith pc[8] = m9 = p2 * x8 + p9 * x9 + p16 * x10 + p23 * x11 + p30 * x12 + p37 * x13 + p44 * x14; 22183287d42SBarry Smith pc[9] = m10 = p3 * x8 + p10 * x9 + p17 * x10 + p24 * x11 + p31 * x12 + p38 * x13 + p45 * x14; 22283287d42SBarry Smith pc[10] = m11 = p4 * x8 + p11 * x9 + p18 * x10 + p25 * x11 + p32 * x12 + p39 * x13 + p46 * x14; 22383287d42SBarry Smith pc[11] = m12 = p5 * x8 + p12 * x9 + p19 * x10 + p26 * x11 + p33 * x12 + p40 * x13 + p47 * x14; 22483287d42SBarry Smith pc[12] = m13 = p6 * x8 + p13 * x9 + p20 * x10 + p27 * x11 + p34 * x12 + p41 * x13 + p48 * x14; 22583287d42SBarry Smith pc[13] = m14 = p7 * x8 + p14 * x9 + p21 * x10 + p28 * x11 + p35 * x12 + p42 * x13 + p49 * x14; 22683287d42SBarry Smith 22783287d42SBarry Smith pc[14] = m15 = p1 * x15 + p8 * x16 + p15 * x17 + p22 * x18 + p29 * x19 + p36 * x20 + p43 * x21; 22883287d42SBarry Smith pc[15] = m16 = p2 * x15 + p9 * x16 + p16 * x17 + p23 * x18 + p30 * x19 + p37 * x20 + p44 * x21; 22983287d42SBarry Smith pc[16] = m17 = p3 * x15 + p10 * x16 + p17 * x17 + p24 * x18 + p31 * x19 + p38 * x20 + p45 * x21; 23083287d42SBarry Smith pc[17] = m18 = p4 * x15 + p11 * x16 + p18 * x17 + p25 * x18 + p32 * x19 + p39 * x20 + p46 * x21; 23183287d42SBarry Smith pc[18] = m19 = p5 * x15 + p12 * x16 + p19 * x17 + p26 * x18 + p33 * x19 + p40 * x20 + p47 * x21; 23283287d42SBarry Smith pc[19] = m20 = p6 * x15 + p13 * x16 + p20 * x17 + p27 * x18 + p34 * x19 + p41 * x20 + p48 * x21; 23383287d42SBarry Smith pc[20] = m21 = p7 * x15 + p14 * x16 + p21 * x17 + p28 * x18 + p35 * x19 + p42 * x20 + p49 * x21; 23483287d42SBarry Smith 23583287d42SBarry Smith pc[21] = m22 = p1 * x22 + p8 * x23 + p15 * x24 + p22 * x25 + p29 * x26 + p36 * x27 + p43 * x28; 23683287d42SBarry Smith pc[22] = m23 = p2 * x22 + p9 * x23 + p16 * x24 + p23 * x25 + p30 * x26 + p37 * x27 + p44 * x28; 23783287d42SBarry Smith pc[23] = m24 = p3 * x22 + p10 * x23 + p17 * x24 + p24 * x25 + p31 * x26 + p38 * x27 + p45 * x28; 23883287d42SBarry Smith pc[24] = m25 = p4 * x22 + p11 * x23 + p18 * x24 + p25 * x25 + p32 * x26 + p39 * x27 + p46 * x28; 23983287d42SBarry Smith pc[25] = m26 = p5 * x22 + p12 * x23 + p19 * x24 + p26 * x25 + p33 * x26 + p40 * x27 + p47 * x28; 24083287d42SBarry Smith pc[26] = m27 = p6 * x22 + p13 * x23 + p20 * x24 + p27 * x25 + p34 * x26 + p41 * x27 + p48 * x28; 24183287d42SBarry Smith pc[27] = m28 = p7 * x22 + p14 * x23 + p21 * x24 + p28 * x25 + p35 * x26 + p42 * x27 + p49 * x28; 24283287d42SBarry Smith 24383287d42SBarry Smith pc[28] = m29 = p1 * x29 + p8 * x30 + p15 * x31 + p22 * x32 + p29 * x33 + p36 * x34 + p43 * x35; 24483287d42SBarry Smith pc[29] = m30 = p2 * x29 + p9 * x30 + p16 * x31 + p23 * x32 + p30 * x33 + p37 * x34 + p44 * x35; 24583287d42SBarry Smith pc[30] = m31 = p3 * x29 + p10 * x30 + p17 * x31 + p24 * x32 + p31 * x33 + p38 * x34 + p45 * x35; 24683287d42SBarry Smith pc[31] = m32 = p4 * x29 + p11 * x30 + p18 * x31 + p25 * x32 + p32 * x33 + p39 * x34 + p46 * x35; 24783287d42SBarry Smith pc[32] = m33 = p5 * x29 + p12 * x30 + p19 * x31 + p26 * x32 + p33 * x33 + p40 * x34 + p47 * x35; 24883287d42SBarry Smith pc[33] = m34 = p6 * x29 + p13 * x30 + p20 * x31 + p27 * x32 + p34 * x33 + p41 * x34 + p48 * x35; 24983287d42SBarry Smith pc[34] = m35 = p7 * x29 + p14 * x30 + p21 * x31 + p28 * x32 + p35 * x33 + p42 * x34 + p49 * x35; 25083287d42SBarry Smith 25183287d42SBarry Smith pc[35] = m36 = p1 * x36 + p8 * x37 + p15 * x38 + p22 * x39 + p29 * x40 + p36 * x41 + p43 * x42; 25283287d42SBarry Smith pc[36] = m37 = p2 * x36 + p9 * x37 + p16 * x38 + p23 * x39 + p30 * x40 + p37 * x41 + p44 * x42; 25383287d42SBarry Smith pc[37] = m38 = p3 * x36 + p10 * x37 + p17 * x38 + p24 * x39 + p31 * x40 + p38 * x41 + p45 * x42; 25483287d42SBarry Smith pc[38] = m39 = p4 * x36 + p11 * x37 + p18 * x38 + p25 * x39 + p32 * x40 + p39 * x41 + p46 * x42; 25583287d42SBarry Smith pc[39] = m40 = p5 * x36 + p12 * x37 + p19 * x38 + p26 * x39 + p33 * x40 + p40 * x41 + p47 * x42; 25683287d42SBarry Smith pc[40] = m41 = p6 * x36 + p13 * x37 + p20 * x38 + p27 * x39 + p34 * x40 + p41 * x41 + p48 * x42; 25783287d42SBarry Smith pc[41] = m42 = p7 * x36 + p14 * x37 + p21 * x38 + p28 * x39 + p35 * x40 + p42 * x41 + p49 * x42; 25883287d42SBarry Smith 25983287d42SBarry Smith pc[42] = m43 = p1 * x43 + p8 * x44 + p15 * x45 + p22 * x46 + p29 * x47 + p36 * x48 + p43 * x49; 26083287d42SBarry Smith pc[43] = m44 = p2 * x43 + p9 * x44 + p16 * x45 + p23 * x46 + p30 * x47 + p37 * x48 + p44 * x49; 26183287d42SBarry Smith pc[44] = m45 = p3 * x43 + p10 * x44 + p17 * x45 + p24 * x46 + p31 * x47 + p38 * x48 + p45 * x49; 26283287d42SBarry Smith pc[45] = m46 = p4 * x43 + p11 * x44 + p18 * x45 + p25 * x46 + p32 * x47 + p39 * x48 + p46 * x49; 26383287d42SBarry Smith pc[46] = m47 = p5 * x43 + p12 * x44 + p19 * x45 + p26 * x46 + p33 * x47 + p40 * x48 + p47 * x49; 26483287d42SBarry Smith pc[47] = m48 = p6 * x43 + p13 * x44 + p20 * x45 + p27 * x46 + p34 * x47 + p41 * x48 + p48 * x49; 26583287d42SBarry Smith pc[48] = m49 = p7 * x43 + p14 * x44 + p21 * x45 + p28 * x46 + p35 * x47 + p42 * x48 + p49 * x49; 26683287d42SBarry Smith 26783287d42SBarry Smith nz = bi[row + 1] - diag_offset[row] - 1; 26883287d42SBarry Smith pv += 49; 26983287d42SBarry Smith for (j = 0; j < nz; j++) { 270*9371c9d4SSatish Balay x1 = pv[0]; 271*9371c9d4SSatish Balay x2 = pv[1]; 272*9371c9d4SSatish Balay x3 = pv[2]; 273*9371c9d4SSatish Balay x4 = pv[3]; 274*9371c9d4SSatish Balay x5 = pv[4]; 275*9371c9d4SSatish Balay x6 = pv[5]; 276*9371c9d4SSatish Balay x7 = pv[6]; 277*9371c9d4SSatish Balay x8 = pv[7]; 278*9371c9d4SSatish Balay x9 = pv[8]; 279*9371c9d4SSatish Balay x10 = pv[9]; 280*9371c9d4SSatish Balay x11 = pv[10]; 281*9371c9d4SSatish Balay x12 = pv[11]; 282*9371c9d4SSatish Balay x13 = pv[12]; 283*9371c9d4SSatish Balay x14 = pv[13]; 284*9371c9d4SSatish Balay x15 = pv[14]; 285*9371c9d4SSatish Balay x16 = pv[15]; 286*9371c9d4SSatish Balay x17 = pv[16]; 287*9371c9d4SSatish Balay x18 = pv[17]; 288*9371c9d4SSatish Balay x19 = pv[18]; 289*9371c9d4SSatish Balay x20 = pv[19]; 290*9371c9d4SSatish Balay x21 = pv[20]; 291*9371c9d4SSatish Balay x22 = pv[21]; 292*9371c9d4SSatish Balay x23 = pv[22]; 293*9371c9d4SSatish Balay x24 = pv[23]; 294*9371c9d4SSatish Balay x25 = pv[24]; 295*9371c9d4SSatish Balay x26 = pv[25]; 296*9371c9d4SSatish Balay x27 = pv[26]; 297*9371c9d4SSatish Balay x28 = pv[27]; 298*9371c9d4SSatish Balay x29 = pv[28]; 299*9371c9d4SSatish Balay x30 = pv[29]; 300*9371c9d4SSatish Balay x31 = pv[30]; 301*9371c9d4SSatish Balay x32 = pv[31]; 302*9371c9d4SSatish Balay x33 = pv[32]; 303*9371c9d4SSatish Balay x34 = pv[33]; 304*9371c9d4SSatish Balay x35 = pv[34]; 305*9371c9d4SSatish Balay x36 = pv[35]; 306*9371c9d4SSatish Balay x37 = pv[36]; 307*9371c9d4SSatish Balay x38 = pv[37]; 308*9371c9d4SSatish Balay x39 = pv[38]; 309*9371c9d4SSatish Balay x40 = pv[39]; 310*9371c9d4SSatish Balay x41 = pv[40]; 311*9371c9d4SSatish Balay x42 = pv[41]; 312*9371c9d4SSatish Balay x43 = pv[42]; 313*9371c9d4SSatish Balay x44 = pv[43]; 314*9371c9d4SSatish Balay x45 = pv[44]; 315*9371c9d4SSatish Balay x46 = pv[45]; 316*9371c9d4SSatish Balay x47 = pv[46]; 317*9371c9d4SSatish Balay x48 = pv[47]; 31883287d42SBarry Smith x49 = pv[48]; 31983287d42SBarry Smith x = rtmp + 49 * pj[j]; 32083287d42SBarry Smith x[0] -= m1 * x1 + m8 * x2 + m15 * x3 + m22 * x4 + m29 * x5 + m36 * x6 + m43 * x7; 32183287d42SBarry Smith x[1] -= m2 * x1 + m9 * x2 + m16 * x3 + m23 * x4 + m30 * x5 + m37 * x6 + m44 * x7; 32283287d42SBarry Smith x[2] -= m3 * x1 + m10 * x2 + m17 * x3 + m24 * x4 + m31 * x5 + m38 * x6 + m45 * x7; 32383287d42SBarry Smith x[3] -= m4 * x1 + m11 * x2 + m18 * x3 + m25 * x4 + m32 * x5 + m39 * x6 + m46 * x7; 32483287d42SBarry Smith x[4] -= m5 * x1 + m12 * x2 + m19 * x3 + m26 * x4 + m33 * x5 + m40 * x6 + m47 * x7; 32583287d42SBarry Smith x[5] -= m6 * x1 + m13 * x2 + m20 * x3 + m27 * x4 + m34 * x5 + m41 * x6 + m48 * x7; 32683287d42SBarry Smith x[6] -= m7 * x1 + m14 * x2 + m21 * x3 + m28 * x4 + m35 * x5 + m42 * x6 + m49 * x7; 32783287d42SBarry Smith 32883287d42SBarry Smith x[7] -= m1 * x8 + m8 * x9 + m15 * x10 + m22 * x11 + m29 * x12 + m36 * x13 + m43 * x14; 32983287d42SBarry Smith x[8] -= m2 * x8 + m9 * x9 + m16 * x10 + m23 * x11 + m30 * x12 + m37 * x13 + m44 * x14; 33083287d42SBarry Smith x[9] -= m3 * x8 + m10 * x9 + m17 * x10 + m24 * x11 + m31 * x12 + m38 * x13 + m45 * x14; 33183287d42SBarry Smith x[10] -= m4 * x8 + m11 * x9 + m18 * x10 + m25 * x11 + m32 * x12 + m39 * x13 + m46 * x14; 33283287d42SBarry Smith x[11] -= m5 * x8 + m12 * x9 + m19 * x10 + m26 * x11 + m33 * x12 + m40 * x13 + m47 * x14; 33383287d42SBarry Smith x[12] -= m6 * x8 + m13 * x9 + m20 * x10 + m27 * x11 + m34 * x12 + m41 * x13 + m48 * x14; 33483287d42SBarry Smith x[13] -= m7 * x8 + m14 * x9 + m21 * x10 + m28 * x11 + m35 * x12 + m42 * x13 + m49 * x14; 33583287d42SBarry Smith 33683287d42SBarry Smith x[14] -= m1 * x15 + m8 * x16 + m15 * x17 + m22 * x18 + m29 * x19 + m36 * x20 + m43 * x21; 33783287d42SBarry Smith x[15] -= m2 * x15 + m9 * x16 + m16 * x17 + m23 * x18 + m30 * x19 + m37 * x20 + m44 * x21; 33883287d42SBarry Smith x[16] -= m3 * x15 + m10 * x16 + m17 * x17 + m24 * x18 + m31 * x19 + m38 * x20 + m45 * x21; 33983287d42SBarry Smith x[17] -= m4 * x15 + m11 * x16 + m18 * x17 + m25 * x18 + m32 * x19 + m39 * x20 + m46 * x21; 34083287d42SBarry Smith x[18] -= m5 * x15 + m12 * x16 + m19 * x17 + m26 * x18 + m33 * x19 + m40 * x20 + m47 * x21; 34183287d42SBarry Smith x[19] -= m6 * x15 + m13 * x16 + m20 * x17 + m27 * x18 + m34 * x19 + m41 * x20 + m48 * x21; 34283287d42SBarry Smith x[20] -= m7 * x15 + m14 * x16 + m21 * x17 + m28 * x18 + m35 * x19 + m42 * x20 + m49 * x21; 34383287d42SBarry Smith 34483287d42SBarry Smith x[21] -= m1 * x22 + m8 * x23 + m15 * x24 + m22 * x25 + m29 * x26 + m36 * x27 + m43 * x28; 34583287d42SBarry Smith x[22] -= m2 * x22 + m9 * x23 + m16 * x24 + m23 * x25 + m30 * x26 + m37 * x27 + m44 * x28; 34683287d42SBarry Smith x[23] -= m3 * x22 + m10 * x23 + m17 * x24 + m24 * x25 + m31 * x26 + m38 * x27 + m45 * x28; 34783287d42SBarry Smith x[24] -= m4 * x22 + m11 * x23 + m18 * x24 + m25 * x25 + m32 * x26 + m39 * x27 + m46 * x28; 34883287d42SBarry Smith x[25] -= m5 * x22 + m12 * x23 + m19 * x24 + m26 * x25 + m33 * x26 + m40 * x27 + m47 * x28; 34983287d42SBarry Smith x[26] -= m6 * x22 + m13 * x23 + m20 * x24 + m27 * x25 + m34 * x26 + m41 * x27 + m48 * x28; 35083287d42SBarry Smith x[27] -= m7 * x22 + m14 * x23 + m21 * x24 + m28 * x25 + m35 * x26 + m42 * x27 + m49 * x28; 35183287d42SBarry Smith 35283287d42SBarry Smith x[28] -= m1 * x29 + m8 * x30 + m15 * x31 + m22 * x32 + m29 * x33 + m36 * x34 + m43 * x35; 35383287d42SBarry Smith x[29] -= m2 * x29 + m9 * x30 + m16 * x31 + m23 * x32 + m30 * x33 + m37 * x34 + m44 * x35; 35483287d42SBarry Smith x[30] -= m3 * x29 + m10 * x30 + m17 * x31 + m24 * x32 + m31 * x33 + m38 * x34 + m45 * x35; 35583287d42SBarry Smith x[31] -= m4 * x29 + m11 * x30 + m18 * x31 + m25 * x32 + m32 * x33 + m39 * x34 + m46 * x35; 35683287d42SBarry Smith x[32] -= m5 * x29 + m12 * x30 + m19 * x31 + m26 * x32 + m33 * x33 + m40 * x34 + m47 * x35; 35783287d42SBarry Smith x[33] -= m6 * x29 + m13 * x30 + m20 * x31 + m27 * x32 + m34 * x33 + m41 * x34 + m48 * x35; 35883287d42SBarry Smith x[34] -= m7 * x29 + m14 * x30 + m21 * x31 + m28 * x32 + m35 * x33 + m42 * x34 + m49 * x35; 35983287d42SBarry Smith 36083287d42SBarry Smith x[35] -= m1 * x36 + m8 * x37 + m15 * x38 + m22 * x39 + m29 * x40 + m36 * x41 + m43 * x42; 36183287d42SBarry Smith x[36] -= m2 * x36 + m9 * x37 + m16 * x38 + m23 * x39 + m30 * x40 + m37 * x41 + m44 * x42; 36283287d42SBarry Smith x[37] -= m3 * x36 + m10 * x37 + m17 * x38 + m24 * x39 + m31 * x40 + m38 * x41 + m45 * x42; 36383287d42SBarry Smith x[38] -= m4 * x36 + m11 * x37 + m18 * x38 + m25 * x39 + m32 * x40 + m39 * x41 + m46 * x42; 36483287d42SBarry Smith x[39] -= m5 * x36 + m12 * x37 + m19 * x38 + m26 * x39 + m33 * x40 + m40 * x41 + m47 * x42; 36583287d42SBarry Smith x[40] -= m6 * x36 + m13 * x37 + m20 * x38 + m27 * x39 + m34 * x40 + m41 * x41 + m48 * x42; 36683287d42SBarry Smith x[41] -= m7 * x36 + m14 * x37 + m21 * x38 + m28 * x39 + m35 * x40 + m42 * x41 + m49 * x42; 36783287d42SBarry Smith 36883287d42SBarry Smith x[42] -= m1 * x43 + m8 * x44 + m15 * x45 + m22 * x46 + m29 * x47 + m36 * x48 + m43 * x49; 36983287d42SBarry Smith x[43] -= m2 * x43 + m9 * x44 + m16 * x45 + m23 * x46 + m30 * x47 + m37 * x48 + m44 * x49; 37083287d42SBarry Smith x[44] -= m3 * x43 + m10 * x44 + m17 * x45 + m24 * x46 + m31 * x47 + m38 * x48 + m45 * x49; 37183287d42SBarry Smith x[45] -= m4 * x43 + m11 * x44 + m18 * x45 + m25 * x46 + m32 * x47 + m39 * x48 + m46 * x49; 37283287d42SBarry Smith x[46] -= m5 * x43 + m12 * x44 + m19 * x45 + m26 * x46 + m33 * x47 + m40 * x48 + m47 * x49; 37383287d42SBarry Smith x[47] -= m6 * x43 + m13 * x44 + m20 * x45 + m27 * x46 + m34 * x47 + m41 * x48 + m48 * x49; 37483287d42SBarry Smith x[48] -= m7 * x43 + m14 * x44 + m21 * x45 + m28 * x46 + m35 * x47 + m42 * x48 + m49 * x49; 37583287d42SBarry Smith pv += 49; 37683287d42SBarry Smith } 3779566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(686.0 * nz + 637.0)); 37883287d42SBarry Smith } 37983287d42SBarry Smith row = *ajtmp++; 38083287d42SBarry Smith } 38183287d42SBarry Smith /* finished row so stick it into b->a */ 38283287d42SBarry Smith pv = ba + 49 * bi[i]; 38383287d42SBarry Smith pj = bj + bi[i]; 38483287d42SBarry Smith nz = bi[i + 1] - bi[i]; 38583287d42SBarry Smith for (j = 0; j < nz; j++) { 38683287d42SBarry Smith x = rtmp + 49 * pj[j]; 387*9371c9d4SSatish Balay pv[0] = x[0]; 388*9371c9d4SSatish Balay pv[1] = x[1]; 389*9371c9d4SSatish Balay pv[2] = x[2]; 390*9371c9d4SSatish Balay pv[3] = x[3]; 391*9371c9d4SSatish Balay pv[4] = x[4]; 392*9371c9d4SSatish Balay pv[5] = x[5]; 393*9371c9d4SSatish Balay pv[6] = x[6]; 394*9371c9d4SSatish Balay pv[7] = x[7]; 395*9371c9d4SSatish Balay pv[8] = x[8]; 396*9371c9d4SSatish Balay pv[9] = x[9]; 397*9371c9d4SSatish Balay pv[10] = x[10]; 398*9371c9d4SSatish Balay pv[11] = x[11]; 399*9371c9d4SSatish Balay pv[12] = x[12]; 400*9371c9d4SSatish Balay pv[13] = x[13]; 401*9371c9d4SSatish Balay pv[14] = x[14]; 402*9371c9d4SSatish Balay pv[15] = x[15]; 403*9371c9d4SSatish Balay pv[16] = x[16]; 404*9371c9d4SSatish Balay pv[17] = x[17]; 405*9371c9d4SSatish Balay pv[18] = x[18]; 406*9371c9d4SSatish Balay pv[19] = x[19]; 407*9371c9d4SSatish Balay pv[20] = x[20]; 408*9371c9d4SSatish Balay pv[21] = x[21]; 409*9371c9d4SSatish Balay pv[22] = x[22]; 410*9371c9d4SSatish Balay pv[23] = x[23]; 411*9371c9d4SSatish Balay pv[24] = x[24]; 412*9371c9d4SSatish Balay pv[25] = x[25]; 413*9371c9d4SSatish Balay pv[26] = x[26]; 414*9371c9d4SSatish Balay pv[27] = x[27]; 415*9371c9d4SSatish Balay pv[28] = x[28]; 416*9371c9d4SSatish Balay pv[29] = x[29]; 417*9371c9d4SSatish Balay pv[30] = x[30]; 418*9371c9d4SSatish Balay pv[31] = x[31]; 419*9371c9d4SSatish Balay pv[32] = x[32]; 420*9371c9d4SSatish Balay pv[33] = x[33]; 421*9371c9d4SSatish Balay pv[34] = x[34]; 422*9371c9d4SSatish Balay pv[35] = x[35]; 423*9371c9d4SSatish Balay pv[36] = x[36]; 424*9371c9d4SSatish Balay pv[37] = x[37]; 425*9371c9d4SSatish Balay pv[38] = x[38]; 426*9371c9d4SSatish Balay pv[39] = x[39]; 427*9371c9d4SSatish Balay pv[40] = x[40]; 428*9371c9d4SSatish Balay pv[41] = x[41]; 429*9371c9d4SSatish Balay pv[42] = x[42]; 430*9371c9d4SSatish Balay pv[43] = x[43]; 431*9371c9d4SSatish Balay pv[44] = x[44]; 432*9371c9d4SSatish Balay pv[45] = x[45]; 433*9371c9d4SSatish Balay pv[46] = x[46]; 434*9371c9d4SSatish Balay pv[47] = x[47]; 43583287d42SBarry Smith pv[48] = x[48]; 43683287d42SBarry Smith pv += 49; 43783287d42SBarry Smith } 43883287d42SBarry Smith /* invert diagonal block */ 43983287d42SBarry Smith w = ba + 49 * diag_offset[i]; 4409566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_7(w, shift, allowzeropivot, &zeropivotdetected)); 4417b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 44283287d42SBarry Smith } 44383287d42SBarry Smith 4449566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 4459566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 4469566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 44726fbe8dcSKarl Rupp 44806e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_7_inplace; 44906e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_inplace; 45083287d42SBarry Smith C->assembled = PETSC_TRUE; 45126fbe8dcSKarl Rupp 4529566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 7 * 7 * 7 * b->mbs)); /* from inverting diagonal blocks */ 45383287d42SBarry Smith PetscFunctionReturn(0); 45483287d42SBarry Smith } 455bef36659SShri Abhyankar 456*9371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_7(Mat B, Mat A, const MatFactorInfo *info) { 457bef36659SShri Abhyankar Mat C = B; 458bef36659SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 459bef36659SShri Abhyankar IS isrow = b->row, isicol = b->icol; 4605a586d82SBarry Smith const PetscInt *r, *ic; 461bbd65245SShri Abhyankar PetscInt i, j, k, nz, nzL, row; 462bbd65245SShri Abhyankar const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 463bbd65245SShri Abhyankar const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 464bef36659SShri Abhyankar MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 465bbd65245SShri Abhyankar PetscInt flg; 466182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 467a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected; 468bef36659SShri Abhyankar 469bef36659SShri Abhyankar PetscFunctionBegin; 4700164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 4719566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 4729566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 473bef36659SShri Abhyankar 474bef36659SShri Abhyankar /* generate work space needed by the factorization */ 4759566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 4769566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rtmp, bs2 * n)); 477bef36659SShri Abhyankar 478bef36659SShri Abhyankar for (i = 0; i < n; i++) { 479bef36659SShri Abhyankar /* zero rtmp */ 480bef36659SShri Abhyankar /* L part */ 481bef36659SShri Abhyankar nz = bi[i + 1] - bi[i]; 482bef36659SShri Abhyankar bjtmp = bj + bi[i]; 483*9371c9d4SSatish Balay for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 484bef36659SShri Abhyankar 485bef36659SShri Abhyankar /* U part */ 48635aa4fcfSShri Abhyankar nz = bdiag[i] - bdiag[i + 1]; 48735aa4fcfSShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1; 488*9371c9d4SSatish Balay for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 48935aa4fcfSShri Abhyankar 49035aa4fcfSShri Abhyankar /* load in initial (unfactored row) */ 49135aa4fcfSShri Abhyankar nz = ai[r[i] + 1] - ai[r[i]]; 49235aa4fcfSShri Abhyankar ajtmp = aj + ai[r[i]]; 49335aa4fcfSShri Abhyankar v = aa + bs2 * ai[r[i]]; 494*9371c9d4SSatish Balay for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); } 49535aa4fcfSShri Abhyankar 49635aa4fcfSShri Abhyankar /* elimination */ 49735aa4fcfSShri Abhyankar bjtmp = bj + bi[i]; 49835aa4fcfSShri Abhyankar nzL = bi[i + 1] - bi[i]; 49935aa4fcfSShri Abhyankar for (k = 0; k < nzL; k++) { 50035aa4fcfSShri Abhyankar row = bjtmp[k]; 50135aa4fcfSShri Abhyankar pc = rtmp + bs2 * row; 502c35f09e5SBarry Smith for (flg = 0, j = 0; j < bs2; j++) { 503c35f09e5SBarry Smith if (pc[j] != 0.0) { 504c35f09e5SBarry Smith flg = 1; 505c35f09e5SBarry Smith break; 506c35f09e5SBarry Smith } 507c35f09e5SBarry Smith } 50835aa4fcfSShri Abhyankar if (flg) { 50935aa4fcfSShri Abhyankar pv = b->a + bs2 * bdiag[row]; 51096b95a6bSBarry Smith /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 5119566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_times_B_7(pc, pv, mwork)); 51235aa4fcfSShri Abhyankar 513a5b23f4aSJose E. Roman pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 51435aa4fcfSShri Abhyankar pv = b->a + bs2 * (bdiag[row + 1] + 1); 51535aa4fcfSShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 51635aa4fcfSShri Abhyankar for (j = 0; j < nz; j++) { 51796b95a6bSBarry Smith /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 51835aa4fcfSShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 51935aa4fcfSShri Abhyankar v = rtmp + bs2 * pj[j]; 5209566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_minus_B_times_C_7(v, pc, pv)); 52135aa4fcfSShri Abhyankar pv += bs2; 52235aa4fcfSShri Abhyankar } 5239566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(686.0 * nz + 637)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 52435aa4fcfSShri Abhyankar } 52535aa4fcfSShri Abhyankar } 52635aa4fcfSShri Abhyankar 52735aa4fcfSShri Abhyankar /* finished row so stick it into b->a */ 52835aa4fcfSShri Abhyankar /* L part */ 52935aa4fcfSShri Abhyankar pv = b->a + bs2 * bi[i]; 53035aa4fcfSShri Abhyankar pj = b->j + bi[i]; 53135aa4fcfSShri Abhyankar nz = bi[i + 1] - bi[i]; 532*9371c9d4SSatish Balay for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 53335aa4fcfSShri Abhyankar 534a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */ 53535aa4fcfSShri Abhyankar pv = b->a + bs2 * bdiag[i]; 53635aa4fcfSShri Abhyankar pj = b->j + bdiag[i]; 5379566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 5389566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_7(pv, shift, allowzeropivot, &zeropivotdetected)); 5397b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 54035aa4fcfSShri Abhyankar 54135aa4fcfSShri Abhyankar /* U part */ 54235aa4fcfSShri Abhyankar pv = b->a + bs2 * (bdiag[i + 1] + 1); 54335aa4fcfSShri Abhyankar pj = b->j + bdiag[i + 1] + 1; 54435aa4fcfSShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; 545*9371c9d4SSatish Balay for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 54635aa4fcfSShri Abhyankar } 54735aa4fcfSShri Abhyankar 5489566063dSJacob Faibussowitsch PetscCall(PetscFree2(rtmp, mwork)); 5499566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 5509566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 55126fbe8dcSKarl Rupp 5524dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_7; 5534dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 55435aa4fcfSShri Abhyankar C->assembled = PETSC_TRUE; 55526fbe8dcSKarl Rupp 5569566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 7 * 7 * 7 * n)); /* from inverting diagonal blocks */ 55735aa4fcfSShri Abhyankar PetscFunctionReturn(0); 55835aa4fcfSShri Abhyankar } 55935aa4fcfSShri Abhyankar 560*9371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) { 561bef36659SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 562bef36659SShri Abhyankar PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 563bef36659SShri Abhyankar PetscInt *ajtmpold, *ajtmp, nz, row; 564bef36659SShri Abhyankar PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 565bef36659SShri Abhyankar MatScalar *pv, *v, *rtmp, *pc, *w, *x; 566bef36659SShri Abhyankar MatScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; 567bef36659SShri Abhyankar MatScalar x16, x17, x18, x19, x20, x21, x22, x23, x24, x25; 568bef36659SShri Abhyankar MatScalar p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15; 569bef36659SShri Abhyankar MatScalar p16, p17, p18, p19, p20, p21, p22, p23, p24, p25; 570bef36659SShri Abhyankar MatScalar m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15; 571bef36659SShri Abhyankar MatScalar m16, m17, m18, m19, m20, m21, m22, m23, m24, m25; 572bef36659SShri Abhyankar MatScalar p26, p27, p28, p29, p30, p31, p32, p33, p34, p35, p36; 573bef36659SShri Abhyankar MatScalar p37, p38, p39, p40, p41, p42, p43, p44, p45, p46, p47, p48, p49; 574bef36659SShri Abhyankar MatScalar x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36; 575bef36659SShri Abhyankar MatScalar x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49; 576bef36659SShri Abhyankar MatScalar m26, m27, m28, m29, m30, m31, m32, m33, m34, m35, m36; 577bef36659SShri Abhyankar MatScalar m37, m38, m39, m40, m41, m42, m43, m44, m45, m46, m47, m48, m49; 578bef36659SShri Abhyankar MatScalar *ba = b->a, *aa = a->a; 579182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 580a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected; 581bef36659SShri Abhyankar 582bef36659SShri Abhyankar PetscFunctionBegin; 5830164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 5849566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(49 * (n + 1), &rtmp)); 585bef36659SShri Abhyankar for (i = 0; i < n; i++) { 586bef36659SShri Abhyankar nz = bi[i + 1] - bi[i]; 587bef36659SShri Abhyankar ajtmp = bj + bi[i]; 588bef36659SShri Abhyankar for (j = 0; j < nz; j++) { 589bef36659SShri Abhyankar x = rtmp + 49 * ajtmp[j]; 590bef36659SShri Abhyankar x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 591bef36659SShri Abhyankar x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0; 592bef36659SShri Abhyankar x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0; 593bef36659SShri Abhyankar x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0; 594bef36659SShri Abhyankar x[34] = x[35] = x[36] = x[37] = x[38] = x[39] = x[40] = x[41] = 0.0; 595bef36659SShri Abhyankar x[42] = x[43] = x[44] = x[45] = x[46] = x[47] = x[48] = 0.0; 596bef36659SShri Abhyankar } 597bef36659SShri Abhyankar /* load in initial (unfactored row) */ 598bef36659SShri Abhyankar nz = ai[i + 1] - ai[i]; 599bef36659SShri Abhyankar ajtmpold = aj + ai[i]; 600bef36659SShri Abhyankar v = aa + 49 * ai[i]; 601bef36659SShri Abhyankar for (j = 0; j < nz; j++) { 602bef36659SShri Abhyankar x = rtmp + 49 * ajtmpold[j]; 603*9371c9d4SSatish Balay x[0] = v[0]; 604*9371c9d4SSatish Balay x[1] = v[1]; 605*9371c9d4SSatish Balay x[2] = v[2]; 606*9371c9d4SSatish Balay x[3] = v[3]; 607*9371c9d4SSatish Balay x[4] = v[4]; 608*9371c9d4SSatish Balay x[5] = v[5]; 609*9371c9d4SSatish Balay x[6] = v[6]; 610*9371c9d4SSatish Balay x[7] = v[7]; 611*9371c9d4SSatish Balay x[8] = v[8]; 612*9371c9d4SSatish Balay x[9] = v[9]; 613*9371c9d4SSatish Balay x[10] = v[10]; 614*9371c9d4SSatish Balay x[11] = v[11]; 615*9371c9d4SSatish Balay x[12] = v[12]; 616*9371c9d4SSatish Balay x[13] = v[13]; 617*9371c9d4SSatish Balay x[14] = v[14]; 618*9371c9d4SSatish Balay x[15] = v[15]; 619*9371c9d4SSatish Balay x[16] = v[16]; 620*9371c9d4SSatish Balay x[17] = v[17]; 621*9371c9d4SSatish Balay x[18] = v[18]; 622*9371c9d4SSatish Balay x[19] = v[19]; 623*9371c9d4SSatish Balay x[20] = v[20]; 624*9371c9d4SSatish Balay x[21] = v[21]; 625*9371c9d4SSatish Balay x[22] = v[22]; 626*9371c9d4SSatish Balay x[23] = v[23]; 627*9371c9d4SSatish Balay x[24] = v[24]; 628*9371c9d4SSatish Balay x[25] = v[25]; 629*9371c9d4SSatish Balay x[26] = v[26]; 630*9371c9d4SSatish Balay x[27] = v[27]; 631*9371c9d4SSatish Balay x[28] = v[28]; 632*9371c9d4SSatish Balay x[29] = v[29]; 633*9371c9d4SSatish Balay x[30] = v[30]; 634*9371c9d4SSatish Balay x[31] = v[31]; 635*9371c9d4SSatish Balay x[32] = v[32]; 636*9371c9d4SSatish Balay x[33] = v[33]; 637*9371c9d4SSatish Balay x[34] = v[34]; 638*9371c9d4SSatish Balay x[35] = v[35]; 639*9371c9d4SSatish Balay x[36] = v[36]; 640*9371c9d4SSatish Balay x[37] = v[37]; 641*9371c9d4SSatish Balay x[38] = v[38]; 642*9371c9d4SSatish Balay x[39] = v[39]; 643*9371c9d4SSatish Balay x[40] = v[40]; 644*9371c9d4SSatish Balay x[41] = v[41]; 645*9371c9d4SSatish Balay x[42] = v[42]; 646*9371c9d4SSatish Balay x[43] = v[43]; 647*9371c9d4SSatish Balay x[44] = v[44]; 648*9371c9d4SSatish Balay x[45] = v[45]; 649*9371c9d4SSatish Balay x[46] = v[46]; 650*9371c9d4SSatish Balay x[47] = v[47]; 651bef36659SShri Abhyankar x[48] = v[48]; 652bef36659SShri Abhyankar v += 49; 653bef36659SShri Abhyankar } 654bef36659SShri Abhyankar row = *ajtmp++; 655bef36659SShri Abhyankar while (row < i) { 656bef36659SShri Abhyankar pc = rtmp + 49 * row; 657*9371c9d4SSatish Balay p1 = pc[0]; 658*9371c9d4SSatish Balay p2 = pc[1]; 659*9371c9d4SSatish Balay p3 = pc[2]; 660*9371c9d4SSatish Balay p4 = pc[3]; 661*9371c9d4SSatish Balay p5 = pc[4]; 662*9371c9d4SSatish Balay p6 = pc[5]; 663*9371c9d4SSatish Balay p7 = pc[6]; 664*9371c9d4SSatish Balay p8 = pc[7]; 665*9371c9d4SSatish Balay p9 = pc[8]; 666*9371c9d4SSatish Balay p10 = pc[9]; 667*9371c9d4SSatish Balay p11 = pc[10]; 668*9371c9d4SSatish Balay p12 = pc[11]; 669*9371c9d4SSatish Balay p13 = pc[12]; 670*9371c9d4SSatish Balay p14 = pc[13]; 671*9371c9d4SSatish Balay p15 = pc[14]; 672*9371c9d4SSatish Balay p16 = pc[15]; 673*9371c9d4SSatish Balay p17 = pc[16]; 674*9371c9d4SSatish Balay p18 = pc[17]; 675*9371c9d4SSatish Balay p19 = pc[18]; 676*9371c9d4SSatish Balay p20 = pc[19]; 677*9371c9d4SSatish Balay p21 = pc[20]; 678*9371c9d4SSatish Balay p22 = pc[21]; 679*9371c9d4SSatish Balay p23 = pc[22]; 680*9371c9d4SSatish Balay p24 = pc[23]; 681*9371c9d4SSatish Balay p25 = pc[24]; 682*9371c9d4SSatish Balay p26 = pc[25]; 683*9371c9d4SSatish Balay p27 = pc[26]; 684*9371c9d4SSatish Balay p28 = pc[27]; 685*9371c9d4SSatish Balay p29 = pc[28]; 686*9371c9d4SSatish Balay p30 = pc[29]; 687*9371c9d4SSatish Balay p31 = pc[30]; 688*9371c9d4SSatish Balay p32 = pc[31]; 689*9371c9d4SSatish Balay p33 = pc[32]; 690*9371c9d4SSatish Balay p34 = pc[33]; 691*9371c9d4SSatish Balay p35 = pc[34]; 692*9371c9d4SSatish Balay p36 = pc[35]; 693*9371c9d4SSatish Balay p37 = pc[36]; 694*9371c9d4SSatish Balay p38 = pc[37]; 695*9371c9d4SSatish Balay p39 = pc[38]; 696*9371c9d4SSatish Balay p40 = pc[39]; 697*9371c9d4SSatish Balay p41 = pc[40]; 698*9371c9d4SSatish Balay p42 = pc[41]; 699*9371c9d4SSatish Balay p43 = pc[42]; 700*9371c9d4SSatish Balay p44 = pc[43]; 701*9371c9d4SSatish Balay p45 = pc[44]; 702*9371c9d4SSatish Balay p46 = pc[45]; 703*9371c9d4SSatish Balay p47 = pc[46]; 704*9371c9d4SSatish Balay p48 = pc[47]; 705bef36659SShri Abhyankar p49 = pc[48]; 706*9371c9d4SSatish Balay if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0 || p37 != 0.0 || p38 != 0.0 || p39 != 0.0 || p40 != 0.0 || p41 != 0.0 || p42 != 0.0 || p43 != 0.0 || p44 != 0.0 || p45 != 0.0 || p46 != 0.0 || p47 != 0.0 || p48 != 0.0 || p49 != 0.0) { 707bef36659SShri Abhyankar pv = ba + 49 * diag_offset[row]; 708bef36659SShri Abhyankar pj = bj + diag_offset[row] + 1; 709*9371c9d4SSatish Balay x1 = pv[0]; 710*9371c9d4SSatish Balay x2 = pv[1]; 711*9371c9d4SSatish Balay x3 = pv[2]; 712*9371c9d4SSatish Balay x4 = pv[3]; 713*9371c9d4SSatish Balay x5 = pv[4]; 714*9371c9d4SSatish Balay x6 = pv[5]; 715*9371c9d4SSatish Balay x7 = pv[6]; 716*9371c9d4SSatish Balay x8 = pv[7]; 717*9371c9d4SSatish Balay x9 = pv[8]; 718*9371c9d4SSatish Balay x10 = pv[9]; 719*9371c9d4SSatish Balay x11 = pv[10]; 720*9371c9d4SSatish Balay x12 = pv[11]; 721*9371c9d4SSatish Balay x13 = pv[12]; 722*9371c9d4SSatish Balay x14 = pv[13]; 723*9371c9d4SSatish Balay x15 = pv[14]; 724*9371c9d4SSatish Balay x16 = pv[15]; 725*9371c9d4SSatish Balay x17 = pv[16]; 726*9371c9d4SSatish Balay x18 = pv[17]; 727*9371c9d4SSatish Balay x19 = pv[18]; 728*9371c9d4SSatish Balay x20 = pv[19]; 729*9371c9d4SSatish Balay x21 = pv[20]; 730*9371c9d4SSatish Balay x22 = pv[21]; 731*9371c9d4SSatish Balay x23 = pv[22]; 732*9371c9d4SSatish Balay x24 = pv[23]; 733*9371c9d4SSatish Balay x25 = pv[24]; 734*9371c9d4SSatish Balay x26 = pv[25]; 735*9371c9d4SSatish Balay x27 = pv[26]; 736*9371c9d4SSatish Balay x28 = pv[27]; 737*9371c9d4SSatish Balay x29 = pv[28]; 738*9371c9d4SSatish Balay x30 = pv[29]; 739*9371c9d4SSatish Balay x31 = pv[30]; 740*9371c9d4SSatish Balay x32 = pv[31]; 741*9371c9d4SSatish Balay x33 = pv[32]; 742*9371c9d4SSatish Balay x34 = pv[33]; 743*9371c9d4SSatish Balay x35 = pv[34]; 744*9371c9d4SSatish Balay x36 = pv[35]; 745*9371c9d4SSatish Balay x37 = pv[36]; 746*9371c9d4SSatish Balay x38 = pv[37]; 747*9371c9d4SSatish Balay x39 = pv[38]; 748*9371c9d4SSatish Balay x40 = pv[39]; 749*9371c9d4SSatish Balay x41 = pv[40]; 750*9371c9d4SSatish Balay x42 = pv[41]; 751*9371c9d4SSatish Balay x43 = pv[42]; 752*9371c9d4SSatish Balay x44 = pv[43]; 753*9371c9d4SSatish Balay x45 = pv[44]; 754*9371c9d4SSatish Balay x46 = pv[45]; 755*9371c9d4SSatish Balay x47 = pv[46]; 756*9371c9d4SSatish Balay x48 = pv[47]; 757bef36659SShri Abhyankar x49 = pv[48]; 758bef36659SShri Abhyankar pc[0] = m1 = p1 * x1 + p8 * x2 + p15 * x3 + p22 * x4 + p29 * x5 + p36 * x6 + p43 * x7; 759bef36659SShri Abhyankar pc[1] = m2 = p2 * x1 + p9 * x2 + p16 * x3 + p23 * x4 + p30 * x5 + p37 * x6 + p44 * x7; 760bef36659SShri Abhyankar pc[2] = m3 = p3 * x1 + p10 * x2 + p17 * x3 + p24 * x4 + p31 * x5 + p38 * x6 + p45 * x7; 761bef36659SShri Abhyankar pc[3] = m4 = p4 * x1 + p11 * x2 + p18 * x3 + p25 * x4 + p32 * x5 + p39 * x6 + p46 * x7; 762bef36659SShri Abhyankar pc[4] = m5 = p5 * x1 + p12 * x2 + p19 * x3 + p26 * x4 + p33 * x5 + p40 * x6 + p47 * x7; 763bef36659SShri Abhyankar pc[5] = m6 = p6 * x1 + p13 * x2 + p20 * x3 + p27 * x4 + p34 * x5 + p41 * x6 + p48 * x7; 764bef36659SShri Abhyankar pc[6] = m7 = p7 * x1 + p14 * x2 + p21 * x3 + p28 * x4 + p35 * x5 + p42 * x6 + p49 * x7; 765bef36659SShri Abhyankar 766bef36659SShri Abhyankar pc[7] = m8 = p1 * x8 + p8 * x9 + p15 * x10 + p22 * x11 + p29 * x12 + p36 * x13 + p43 * x14; 767bef36659SShri Abhyankar pc[8] = m9 = p2 * x8 + p9 * x9 + p16 * x10 + p23 * x11 + p30 * x12 + p37 * x13 + p44 * x14; 768bef36659SShri Abhyankar pc[9] = m10 = p3 * x8 + p10 * x9 + p17 * x10 + p24 * x11 + p31 * x12 + p38 * x13 + p45 * x14; 769bef36659SShri Abhyankar pc[10] = m11 = p4 * x8 + p11 * x9 + p18 * x10 + p25 * x11 + p32 * x12 + p39 * x13 + p46 * x14; 770bef36659SShri Abhyankar pc[11] = m12 = p5 * x8 + p12 * x9 + p19 * x10 + p26 * x11 + p33 * x12 + p40 * x13 + p47 * x14; 771bef36659SShri Abhyankar pc[12] = m13 = p6 * x8 + p13 * x9 + p20 * x10 + p27 * x11 + p34 * x12 + p41 * x13 + p48 * x14; 772bef36659SShri Abhyankar pc[13] = m14 = p7 * x8 + p14 * x9 + p21 * x10 + p28 * x11 + p35 * x12 + p42 * x13 + p49 * x14; 773bef36659SShri Abhyankar 774bef36659SShri Abhyankar pc[14] = m15 = p1 * x15 + p8 * x16 + p15 * x17 + p22 * x18 + p29 * x19 + p36 * x20 + p43 * x21; 775bef36659SShri Abhyankar pc[15] = m16 = p2 * x15 + p9 * x16 + p16 * x17 + p23 * x18 + p30 * x19 + p37 * x20 + p44 * x21; 776bef36659SShri Abhyankar pc[16] = m17 = p3 * x15 + p10 * x16 + p17 * x17 + p24 * x18 + p31 * x19 + p38 * x20 + p45 * x21; 777bef36659SShri Abhyankar pc[17] = m18 = p4 * x15 + p11 * x16 + p18 * x17 + p25 * x18 + p32 * x19 + p39 * x20 + p46 * x21; 778bef36659SShri Abhyankar pc[18] = m19 = p5 * x15 + p12 * x16 + p19 * x17 + p26 * x18 + p33 * x19 + p40 * x20 + p47 * x21; 779bef36659SShri Abhyankar pc[19] = m20 = p6 * x15 + p13 * x16 + p20 * x17 + p27 * x18 + p34 * x19 + p41 * x20 + p48 * x21; 780bef36659SShri Abhyankar pc[20] = m21 = p7 * x15 + p14 * x16 + p21 * x17 + p28 * x18 + p35 * x19 + p42 * x20 + p49 * x21; 781bef36659SShri Abhyankar 782bef36659SShri Abhyankar pc[21] = m22 = p1 * x22 + p8 * x23 + p15 * x24 + p22 * x25 + p29 * x26 + p36 * x27 + p43 * x28; 783bef36659SShri Abhyankar pc[22] = m23 = p2 * x22 + p9 * x23 + p16 * x24 + p23 * x25 + p30 * x26 + p37 * x27 + p44 * x28; 784bef36659SShri Abhyankar pc[23] = m24 = p3 * x22 + p10 * x23 + p17 * x24 + p24 * x25 + p31 * x26 + p38 * x27 + p45 * x28; 785bef36659SShri Abhyankar pc[24] = m25 = p4 * x22 + p11 * x23 + p18 * x24 + p25 * x25 + p32 * x26 + p39 * x27 + p46 * x28; 786bef36659SShri Abhyankar pc[25] = m26 = p5 * x22 + p12 * x23 + p19 * x24 + p26 * x25 + p33 * x26 + p40 * x27 + p47 * x28; 787bef36659SShri Abhyankar pc[26] = m27 = p6 * x22 + p13 * x23 + p20 * x24 + p27 * x25 + p34 * x26 + p41 * x27 + p48 * x28; 788bef36659SShri Abhyankar pc[27] = m28 = p7 * x22 + p14 * x23 + p21 * x24 + p28 * x25 + p35 * x26 + p42 * x27 + p49 * x28; 789bef36659SShri Abhyankar 790bef36659SShri Abhyankar pc[28] = m29 = p1 * x29 + p8 * x30 + p15 * x31 + p22 * x32 + p29 * x33 + p36 * x34 + p43 * x35; 791bef36659SShri Abhyankar pc[29] = m30 = p2 * x29 + p9 * x30 + p16 * x31 + p23 * x32 + p30 * x33 + p37 * x34 + p44 * x35; 792bef36659SShri Abhyankar pc[30] = m31 = p3 * x29 + p10 * x30 + p17 * x31 + p24 * x32 + p31 * x33 + p38 * x34 + p45 * x35; 793bef36659SShri Abhyankar pc[31] = m32 = p4 * x29 + p11 * x30 + p18 * x31 + p25 * x32 + p32 * x33 + p39 * x34 + p46 * x35; 794bef36659SShri Abhyankar pc[32] = m33 = p5 * x29 + p12 * x30 + p19 * x31 + p26 * x32 + p33 * x33 + p40 * x34 + p47 * x35; 795bef36659SShri Abhyankar pc[33] = m34 = p6 * x29 + p13 * x30 + p20 * x31 + p27 * x32 + p34 * x33 + p41 * x34 + p48 * x35; 796bef36659SShri Abhyankar pc[34] = m35 = p7 * x29 + p14 * x30 + p21 * x31 + p28 * x32 + p35 * x33 + p42 * x34 + p49 * x35; 797bef36659SShri Abhyankar 798bef36659SShri Abhyankar pc[35] = m36 = p1 * x36 + p8 * x37 + p15 * x38 + p22 * x39 + p29 * x40 + p36 * x41 + p43 * x42; 799bef36659SShri Abhyankar pc[36] = m37 = p2 * x36 + p9 * x37 + p16 * x38 + p23 * x39 + p30 * x40 + p37 * x41 + p44 * x42; 800bef36659SShri Abhyankar pc[37] = m38 = p3 * x36 + p10 * x37 + p17 * x38 + p24 * x39 + p31 * x40 + p38 * x41 + p45 * x42; 801bef36659SShri Abhyankar pc[38] = m39 = p4 * x36 + p11 * x37 + p18 * x38 + p25 * x39 + p32 * x40 + p39 * x41 + p46 * x42; 802bef36659SShri Abhyankar pc[39] = m40 = p5 * x36 + p12 * x37 + p19 * x38 + p26 * x39 + p33 * x40 + p40 * x41 + p47 * x42; 803bef36659SShri Abhyankar pc[40] = m41 = p6 * x36 + p13 * x37 + p20 * x38 + p27 * x39 + p34 * x40 + p41 * x41 + p48 * x42; 804bef36659SShri Abhyankar pc[41] = m42 = p7 * x36 + p14 * x37 + p21 * x38 + p28 * x39 + p35 * x40 + p42 * x41 + p49 * x42; 805bef36659SShri Abhyankar 806bef36659SShri Abhyankar pc[42] = m43 = p1 * x43 + p8 * x44 + p15 * x45 + p22 * x46 + p29 * x47 + p36 * x48 + p43 * x49; 807bef36659SShri Abhyankar pc[43] = m44 = p2 * x43 + p9 * x44 + p16 * x45 + p23 * x46 + p30 * x47 + p37 * x48 + p44 * x49; 808bef36659SShri Abhyankar pc[44] = m45 = p3 * x43 + p10 * x44 + p17 * x45 + p24 * x46 + p31 * x47 + p38 * x48 + p45 * x49; 809bef36659SShri Abhyankar pc[45] = m46 = p4 * x43 + p11 * x44 + p18 * x45 + p25 * x46 + p32 * x47 + p39 * x48 + p46 * x49; 810bef36659SShri Abhyankar pc[46] = m47 = p5 * x43 + p12 * x44 + p19 * x45 + p26 * x46 + p33 * x47 + p40 * x48 + p47 * x49; 811bef36659SShri Abhyankar pc[47] = m48 = p6 * x43 + p13 * x44 + p20 * x45 + p27 * x46 + p34 * x47 + p41 * x48 + p48 * x49; 812bef36659SShri Abhyankar pc[48] = m49 = p7 * x43 + p14 * x44 + p21 * x45 + p28 * x46 + p35 * x47 + p42 * x48 + p49 * x49; 813bef36659SShri Abhyankar 814bef36659SShri Abhyankar nz = bi[row + 1] - diag_offset[row] - 1; 815bef36659SShri Abhyankar pv += 49; 816bef36659SShri Abhyankar for (j = 0; j < nz; j++) { 817*9371c9d4SSatish Balay x1 = pv[0]; 818*9371c9d4SSatish Balay x2 = pv[1]; 819*9371c9d4SSatish Balay x3 = pv[2]; 820*9371c9d4SSatish Balay x4 = pv[3]; 821*9371c9d4SSatish Balay x5 = pv[4]; 822*9371c9d4SSatish Balay x6 = pv[5]; 823*9371c9d4SSatish Balay x7 = pv[6]; 824*9371c9d4SSatish Balay x8 = pv[7]; 825*9371c9d4SSatish Balay x9 = pv[8]; 826*9371c9d4SSatish Balay x10 = pv[9]; 827*9371c9d4SSatish Balay x11 = pv[10]; 828*9371c9d4SSatish Balay x12 = pv[11]; 829*9371c9d4SSatish Balay x13 = pv[12]; 830*9371c9d4SSatish Balay x14 = pv[13]; 831*9371c9d4SSatish Balay x15 = pv[14]; 832*9371c9d4SSatish Balay x16 = pv[15]; 833*9371c9d4SSatish Balay x17 = pv[16]; 834*9371c9d4SSatish Balay x18 = pv[17]; 835*9371c9d4SSatish Balay x19 = pv[18]; 836*9371c9d4SSatish Balay x20 = pv[19]; 837*9371c9d4SSatish Balay x21 = pv[20]; 838*9371c9d4SSatish Balay x22 = pv[21]; 839*9371c9d4SSatish Balay x23 = pv[22]; 840*9371c9d4SSatish Balay x24 = pv[23]; 841*9371c9d4SSatish Balay x25 = pv[24]; 842*9371c9d4SSatish Balay x26 = pv[25]; 843*9371c9d4SSatish Balay x27 = pv[26]; 844*9371c9d4SSatish Balay x28 = pv[27]; 845*9371c9d4SSatish Balay x29 = pv[28]; 846*9371c9d4SSatish Balay x30 = pv[29]; 847*9371c9d4SSatish Balay x31 = pv[30]; 848*9371c9d4SSatish Balay x32 = pv[31]; 849*9371c9d4SSatish Balay x33 = pv[32]; 850*9371c9d4SSatish Balay x34 = pv[33]; 851*9371c9d4SSatish Balay x35 = pv[34]; 852*9371c9d4SSatish Balay x36 = pv[35]; 853*9371c9d4SSatish Balay x37 = pv[36]; 854*9371c9d4SSatish Balay x38 = pv[37]; 855*9371c9d4SSatish Balay x39 = pv[38]; 856*9371c9d4SSatish Balay x40 = pv[39]; 857*9371c9d4SSatish Balay x41 = pv[40]; 858*9371c9d4SSatish Balay x42 = pv[41]; 859*9371c9d4SSatish Balay x43 = pv[42]; 860*9371c9d4SSatish Balay x44 = pv[43]; 861*9371c9d4SSatish Balay x45 = pv[44]; 862*9371c9d4SSatish Balay x46 = pv[45]; 863*9371c9d4SSatish Balay x47 = pv[46]; 864*9371c9d4SSatish Balay x48 = pv[47]; 865bef36659SShri Abhyankar x49 = pv[48]; 866bef36659SShri Abhyankar x = rtmp + 49 * pj[j]; 867bef36659SShri Abhyankar x[0] -= m1 * x1 + m8 * x2 + m15 * x3 + m22 * x4 + m29 * x5 + m36 * x6 + m43 * x7; 868bef36659SShri Abhyankar x[1] -= m2 * x1 + m9 * x2 + m16 * x3 + m23 * x4 + m30 * x5 + m37 * x6 + m44 * x7; 869bef36659SShri Abhyankar x[2] -= m3 * x1 + m10 * x2 + m17 * x3 + m24 * x4 + m31 * x5 + m38 * x6 + m45 * x7; 870bef36659SShri Abhyankar x[3] -= m4 * x1 + m11 * x2 + m18 * x3 + m25 * x4 + m32 * x5 + m39 * x6 + m46 * x7; 871bef36659SShri Abhyankar x[4] -= m5 * x1 + m12 * x2 + m19 * x3 + m26 * x4 + m33 * x5 + m40 * x6 + m47 * x7; 872bef36659SShri Abhyankar x[5] -= m6 * x1 + m13 * x2 + m20 * x3 + m27 * x4 + m34 * x5 + m41 * x6 + m48 * x7; 873bef36659SShri Abhyankar x[6] -= m7 * x1 + m14 * x2 + m21 * x3 + m28 * x4 + m35 * x5 + m42 * x6 + m49 * x7; 874bef36659SShri Abhyankar 875bef36659SShri Abhyankar x[7] -= m1 * x8 + m8 * x9 + m15 * x10 + m22 * x11 + m29 * x12 + m36 * x13 + m43 * x14; 876bef36659SShri Abhyankar x[8] -= m2 * x8 + m9 * x9 + m16 * x10 + m23 * x11 + m30 * x12 + m37 * x13 + m44 * x14; 877bef36659SShri Abhyankar x[9] -= m3 * x8 + m10 * x9 + m17 * x10 + m24 * x11 + m31 * x12 + m38 * x13 + m45 * x14; 878bef36659SShri Abhyankar x[10] -= m4 * x8 + m11 * x9 + m18 * x10 + m25 * x11 + m32 * x12 + m39 * x13 + m46 * x14; 879bef36659SShri Abhyankar x[11] -= m5 * x8 + m12 * x9 + m19 * x10 + m26 * x11 + m33 * x12 + m40 * x13 + m47 * x14; 880bef36659SShri Abhyankar x[12] -= m6 * x8 + m13 * x9 + m20 * x10 + m27 * x11 + m34 * x12 + m41 * x13 + m48 * x14; 881bef36659SShri Abhyankar x[13] -= m7 * x8 + m14 * x9 + m21 * x10 + m28 * x11 + m35 * x12 + m42 * x13 + m49 * x14; 882bef36659SShri Abhyankar 883bef36659SShri Abhyankar x[14] -= m1 * x15 + m8 * x16 + m15 * x17 + m22 * x18 + m29 * x19 + m36 * x20 + m43 * x21; 884bef36659SShri Abhyankar x[15] -= m2 * x15 + m9 * x16 + m16 * x17 + m23 * x18 + m30 * x19 + m37 * x20 + m44 * x21; 885bef36659SShri Abhyankar x[16] -= m3 * x15 + m10 * x16 + m17 * x17 + m24 * x18 + m31 * x19 + m38 * x20 + m45 * x21; 886bef36659SShri Abhyankar x[17] -= m4 * x15 + m11 * x16 + m18 * x17 + m25 * x18 + m32 * x19 + m39 * x20 + m46 * x21; 887bef36659SShri Abhyankar x[18] -= m5 * x15 + m12 * x16 + m19 * x17 + m26 * x18 + m33 * x19 + m40 * x20 + m47 * x21; 888bef36659SShri Abhyankar x[19] -= m6 * x15 + m13 * x16 + m20 * x17 + m27 * x18 + m34 * x19 + m41 * x20 + m48 * x21; 889bef36659SShri Abhyankar x[20] -= m7 * x15 + m14 * x16 + m21 * x17 + m28 * x18 + m35 * x19 + m42 * x20 + m49 * x21; 890bef36659SShri Abhyankar 891bef36659SShri Abhyankar x[21] -= m1 * x22 + m8 * x23 + m15 * x24 + m22 * x25 + m29 * x26 + m36 * x27 + m43 * x28; 892bef36659SShri Abhyankar x[22] -= m2 * x22 + m9 * x23 + m16 * x24 + m23 * x25 + m30 * x26 + m37 * x27 + m44 * x28; 893bef36659SShri Abhyankar x[23] -= m3 * x22 + m10 * x23 + m17 * x24 + m24 * x25 + m31 * x26 + m38 * x27 + m45 * x28; 894bef36659SShri Abhyankar x[24] -= m4 * x22 + m11 * x23 + m18 * x24 + m25 * x25 + m32 * x26 + m39 * x27 + m46 * x28; 895bef36659SShri Abhyankar x[25] -= m5 * x22 + m12 * x23 + m19 * x24 + m26 * x25 + m33 * x26 + m40 * x27 + m47 * x28; 896bef36659SShri Abhyankar x[26] -= m6 * x22 + m13 * x23 + m20 * x24 + m27 * x25 + m34 * x26 + m41 * x27 + m48 * x28; 897bef36659SShri Abhyankar x[27] -= m7 * x22 + m14 * x23 + m21 * x24 + m28 * x25 + m35 * x26 + m42 * x27 + m49 * x28; 898bef36659SShri Abhyankar 899bef36659SShri Abhyankar x[28] -= m1 * x29 + m8 * x30 + m15 * x31 + m22 * x32 + m29 * x33 + m36 * x34 + m43 * x35; 900bef36659SShri Abhyankar x[29] -= m2 * x29 + m9 * x30 + m16 * x31 + m23 * x32 + m30 * x33 + m37 * x34 + m44 * x35; 901bef36659SShri Abhyankar x[30] -= m3 * x29 + m10 * x30 + m17 * x31 + m24 * x32 + m31 * x33 + m38 * x34 + m45 * x35; 902bef36659SShri Abhyankar x[31] -= m4 * x29 + m11 * x30 + m18 * x31 + m25 * x32 + m32 * x33 + m39 * x34 + m46 * x35; 903bef36659SShri Abhyankar x[32] -= m5 * x29 + m12 * x30 + m19 * x31 + m26 * x32 + m33 * x33 + m40 * x34 + m47 * x35; 904bef36659SShri Abhyankar x[33] -= m6 * x29 + m13 * x30 + m20 * x31 + m27 * x32 + m34 * x33 + m41 * x34 + m48 * x35; 905bef36659SShri Abhyankar x[34] -= m7 * x29 + m14 * x30 + m21 * x31 + m28 * x32 + m35 * x33 + m42 * x34 + m49 * x35; 906bef36659SShri Abhyankar 907bef36659SShri Abhyankar x[35] -= m1 * x36 + m8 * x37 + m15 * x38 + m22 * x39 + m29 * x40 + m36 * x41 + m43 * x42; 908bef36659SShri Abhyankar x[36] -= m2 * x36 + m9 * x37 + m16 * x38 + m23 * x39 + m30 * x40 + m37 * x41 + m44 * x42; 909bef36659SShri Abhyankar x[37] -= m3 * x36 + m10 * x37 + m17 * x38 + m24 * x39 + m31 * x40 + m38 * x41 + m45 * x42; 910bef36659SShri Abhyankar x[38] -= m4 * x36 + m11 * x37 + m18 * x38 + m25 * x39 + m32 * x40 + m39 * x41 + m46 * x42; 911bef36659SShri Abhyankar x[39] -= m5 * x36 + m12 * x37 + m19 * x38 + m26 * x39 + m33 * x40 + m40 * x41 + m47 * x42; 912bef36659SShri Abhyankar x[40] -= m6 * x36 + m13 * x37 + m20 * x38 + m27 * x39 + m34 * x40 + m41 * x41 + m48 * x42; 913bef36659SShri Abhyankar x[41] -= m7 * x36 + m14 * x37 + m21 * x38 + m28 * x39 + m35 * x40 + m42 * x41 + m49 * x42; 914bef36659SShri Abhyankar 915bef36659SShri Abhyankar x[42] -= m1 * x43 + m8 * x44 + m15 * x45 + m22 * x46 + m29 * x47 + m36 * x48 + m43 * x49; 916bef36659SShri Abhyankar x[43] -= m2 * x43 + m9 * x44 + m16 * x45 + m23 * x46 + m30 * x47 + m37 * x48 + m44 * x49; 917bef36659SShri Abhyankar x[44] -= m3 * x43 + m10 * x44 + m17 * x45 + m24 * x46 + m31 * x47 + m38 * x48 + m45 * x49; 918bef36659SShri Abhyankar x[45] -= m4 * x43 + m11 * x44 + m18 * x45 + m25 * x46 + m32 * x47 + m39 * x48 + m46 * x49; 919bef36659SShri Abhyankar x[46] -= m5 * x43 + m12 * x44 + m19 * x45 + m26 * x46 + m33 * x47 + m40 * x48 + m47 * x49; 920bef36659SShri Abhyankar x[47] -= m6 * x43 + m13 * x44 + m20 * x45 + m27 * x46 + m34 * x47 + m41 * x48 + m48 * x49; 921bef36659SShri Abhyankar x[48] -= m7 * x43 + m14 * x44 + m21 * x45 + m28 * x46 + m35 * x47 + m42 * x48 + m49 * x49; 922bef36659SShri Abhyankar pv += 49; 923bef36659SShri Abhyankar } 9249566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(686.0 * nz + 637.0)); 925bef36659SShri Abhyankar } 926bef36659SShri Abhyankar row = *ajtmp++; 927bef36659SShri Abhyankar } 928bef36659SShri Abhyankar /* finished row so stick it into b->a */ 929bef36659SShri Abhyankar pv = ba + 49 * bi[i]; 930bef36659SShri Abhyankar pj = bj + bi[i]; 931bef36659SShri Abhyankar nz = bi[i + 1] - bi[i]; 932bef36659SShri Abhyankar for (j = 0; j < nz; j++) { 933bef36659SShri Abhyankar x = rtmp + 49 * pj[j]; 934*9371c9d4SSatish Balay pv[0] = x[0]; 935*9371c9d4SSatish Balay pv[1] = x[1]; 936*9371c9d4SSatish Balay pv[2] = x[2]; 937*9371c9d4SSatish Balay pv[3] = x[3]; 938*9371c9d4SSatish Balay pv[4] = x[4]; 939*9371c9d4SSatish Balay pv[5] = x[5]; 940*9371c9d4SSatish Balay pv[6] = x[6]; 941*9371c9d4SSatish Balay pv[7] = x[7]; 942*9371c9d4SSatish Balay pv[8] = x[8]; 943*9371c9d4SSatish Balay pv[9] = x[9]; 944*9371c9d4SSatish Balay pv[10] = x[10]; 945*9371c9d4SSatish Balay pv[11] = x[11]; 946*9371c9d4SSatish Balay pv[12] = x[12]; 947*9371c9d4SSatish Balay pv[13] = x[13]; 948*9371c9d4SSatish Balay pv[14] = x[14]; 949*9371c9d4SSatish Balay pv[15] = x[15]; 950*9371c9d4SSatish Balay pv[16] = x[16]; 951*9371c9d4SSatish Balay pv[17] = x[17]; 952*9371c9d4SSatish Balay pv[18] = x[18]; 953*9371c9d4SSatish Balay pv[19] = x[19]; 954*9371c9d4SSatish Balay pv[20] = x[20]; 955*9371c9d4SSatish Balay pv[21] = x[21]; 956*9371c9d4SSatish Balay pv[22] = x[22]; 957*9371c9d4SSatish Balay pv[23] = x[23]; 958*9371c9d4SSatish Balay pv[24] = x[24]; 959*9371c9d4SSatish Balay pv[25] = x[25]; 960*9371c9d4SSatish Balay pv[26] = x[26]; 961*9371c9d4SSatish Balay pv[27] = x[27]; 962*9371c9d4SSatish Balay pv[28] = x[28]; 963*9371c9d4SSatish Balay pv[29] = x[29]; 964*9371c9d4SSatish Balay pv[30] = x[30]; 965*9371c9d4SSatish Balay pv[31] = x[31]; 966*9371c9d4SSatish Balay pv[32] = x[32]; 967*9371c9d4SSatish Balay pv[33] = x[33]; 968*9371c9d4SSatish Balay pv[34] = x[34]; 969*9371c9d4SSatish Balay pv[35] = x[35]; 970*9371c9d4SSatish Balay pv[36] = x[36]; 971*9371c9d4SSatish Balay pv[37] = x[37]; 972*9371c9d4SSatish Balay pv[38] = x[38]; 973*9371c9d4SSatish Balay pv[39] = x[39]; 974*9371c9d4SSatish Balay pv[40] = x[40]; 975*9371c9d4SSatish Balay pv[41] = x[41]; 976*9371c9d4SSatish Balay pv[42] = x[42]; 977*9371c9d4SSatish Balay pv[43] = x[43]; 978*9371c9d4SSatish Balay pv[44] = x[44]; 979*9371c9d4SSatish Balay pv[45] = x[45]; 980*9371c9d4SSatish Balay pv[46] = x[46]; 981*9371c9d4SSatish Balay pv[47] = x[47]; 982bef36659SShri Abhyankar pv[48] = x[48]; 983bef36659SShri Abhyankar pv += 49; 984bef36659SShri Abhyankar } 985bef36659SShri Abhyankar /* invert diagonal block */ 986bef36659SShri Abhyankar w = ba + 49 * diag_offset[i]; 9879566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_7(w, shift, allowzeropivot, &zeropivotdetected)); 9887b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 989bef36659SShri Abhyankar } 990bef36659SShri Abhyankar 9919566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 99226fbe8dcSKarl Rupp 99306e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_inplace; 99406e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace; 995bef36659SShri Abhyankar C->assembled = PETSC_TRUE; 99626fbe8dcSKarl Rupp 9979566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 7 * 7 * 7 * b->mbs)); /* from inverting diagonal blocks */ 998bef36659SShri Abhyankar PetscFunctionReturn(0); 999bef36659SShri Abhyankar } 1000bef36659SShri Abhyankar 1001*9371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) { 1002bef36659SShri Abhyankar Mat C = B; 1003bef36659SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 1004bbd65245SShri Abhyankar PetscInt i, j, k, nz, nzL, row; 1005bbd65245SShri Abhyankar const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 1006bbd65245SShri Abhyankar const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 1007bef36659SShri Abhyankar MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 1008bbd65245SShri Abhyankar PetscInt flg; 1009182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 1010a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected; 1011bef36659SShri Abhyankar 1012bef36659SShri Abhyankar PetscFunctionBegin; 10130164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 10140164db54SHong Zhang 1015bef36659SShri Abhyankar /* generate work space needed by the factorization */ 10169566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 10179566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rtmp, bs2 * n)); 1018bef36659SShri Abhyankar 1019bef36659SShri Abhyankar for (i = 0; i < n; i++) { 1020bef36659SShri Abhyankar /* zero rtmp */ 1021bef36659SShri Abhyankar /* L part */ 1022bef36659SShri Abhyankar nz = bi[i + 1] - bi[i]; 1023bef36659SShri Abhyankar bjtmp = bj + bi[i]; 1024*9371c9d4SSatish Balay for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 1025bef36659SShri Abhyankar 1026bef36659SShri Abhyankar /* U part */ 102753cca76cSShri Abhyankar nz = bdiag[i] - bdiag[i + 1]; 102853cca76cSShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1; 1029*9371c9d4SSatish Balay for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 103053cca76cSShri Abhyankar 103153cca76cSShri Abhyankar /* load in initial (unfactored row) */ 103253cca76cSShri Abhyankar nz = ai[i + 1] - ai[i]; 103353cca76cSShri Abhyankar ajtmp = aj + ai[i]; 103453cca76cSShri Abhyankar v = aa + bs2 * ai[i]; 1035*9371c9d4SSatish Balay for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); } 103653cca76cSShri Abhyankar 103753cca76cSShri Abhyankar /* elimination */ 103853cca76cSShri Abhyankar bjtmp = bj + bi[i]; 103953cca76cSShri Abhyankar nzL = bi[i + 1] - bi[i]; 104053cca76cSShri Abhyankar for (k = 0; k < nzL; k++) { 104153cca76cSShri Abhyankar row = bjtmp[k]; 104253cca76cSShri Abhyankar pc = rtmp + bs2 * row; 1043c35f09e5SBarry Smith for (flg = 0, j = 0; j < bs2; j++) { 1044c35f09e5SBarry Smith if (pc[j] != 0.0) { 1045c35f09e5SBarry Smith flg = 1; 1046c35f09e5SBarry Smith break; 1047c35f09e5SBarry Smith } 1048c35f09e5SBarry Smith } 104953cca76cSShri Abhyankar if (flg) { 105053cca76cSShri Abhyankar pv = b->a + bs2 * bdiag[row]; 105196b95a6bSBarry Smith /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 10529566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_times_B_7(pc, pv, mwork)); 105353cca76cSShri Abhyankar 1054a5b23f4aSJose E. Roman pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 105553cca76cSShri Abhyankar pv = b->a + bs2 * (bdiag[row + 1] + 1); 105653cca76cSShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 105753cca76cSShri Abhyankar for (j = 0; j < nz; j++) { 105896b95a6bSBarry Smith /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 105953cca76cSShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 106053cca76cSShri Abhyankar v = rtmp + bs2 * pj[j]; 10619566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_minus_B_times_C_7(v, pc, pv)); 106253cca76cSShri Abhyankar pv += bs2; 106353cca76cSShri Abhyankar } 10649566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(686.0 * nz + 637)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 106553cca76cSShri Abhyankar } 106653cca76cSShri Abhyankar } 106753cca76cSShri Abhyankar 106853cca76cSShri Abhyankar /* finished row so stick it into b->a */ 106953cca76cSShri Abhyankar /* L part */ 107053cca76cSShri Abhyankar pv = b->a + bs2 * bi[i]; 107153cca76cSShri Abhyankar pj = b->j + bi[i]; 107253cca76cSShri Abhyankar nz = bi[i + 1] - bi[i]; 1073*9371c9d4SSatish Balay for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 107453cca76cSShri Abhyankar 1075a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */ 107653cca76cSShri Abhyankar pv = b->a + bs2 * bdiag[i]; 107753cca76cSShri Abhyankar pj = b->j + bdiag[i]; 10789566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 10799566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_7(pv, shift, allowzeropivot, &zeropivotdetected)); 10807b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 108153cca76cSShri Abhyankar 108253cca76cSShri Abhyankar /* U part */ 108353cca76cSShri Abhyankar pv = b->a + bs2 * (bdiag[i + 1] + 1); 108453cca76cSShri Abhyankar pj = b->j + bdiag[i + 1] + 1; 108553cca76cSShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; 1086*9371c9d4SSatish Balay for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 108753cca76cSShri Abhyankar } 10889566063dSJacob Faibussowitsch PetscCall(PetscFree2(rtmp, mwork)); 108926fbe8dcSKarl Rupp 10904dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 10914dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 109253cca76cSShri Abhyankar C->assembled = PETSC_TRUE; 109326fbe8dcSKarl Rupp 10949566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 7 * 7 * 7 * n)); /* from inverting diagonal blocks */ 109553cca76cSShri Abhyankar PetscFunctionReturn(0); 109653cca76cSShri Abhyankar } 1097