14c1414c8SBarry Smith 24c1414c8SBarry Smith /* 34c1414c8SBarry Smith This file provides high performance routines for the Inode format (compressed sparse row) 44c1414c8SBarry Smith by taking advantage of rows with identical nonzero structure (I-nodes). 54c1414c8SBarry Smith */ 6c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> 7*fb56d528SJed Brown #if defined(PETSC_HAVE_XMMINTRIN_H) 8*fb56d528SJed Brown #include <xmmintrin.h> 9*fb56d528SJed Brown #endif 104c1414c8SBarry Smith 11d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 12d71ae5a4SJacob Faibussowitsch { 134c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 144c1414c8SBarry Smith PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 154c1414c8SBarry Smith 164c1414c8SBarry Smith PetscFunctionBegin; 17d0f46423SBarry Smith n = A->cmap->n; 18d0f46423SBarry Smith m = A->rmap->n; 1908401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 204c1414c8SBarry Smith ns_row = a->inode.size; 214c1414c8SBarry Smith 224c1414c8SBarry Smith min_mn = (m < n) ? m : n; 234c1414c8SBarry Smith if (!ns) { 249371c9d4SSatish Balay for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) 259371c9d4SSatish Balay ; 269371c9d4SSatish Balay for (; count + 1 < n; count++, i++) 279371c9d4SSatish Balay ; 28ad540459SPierre Jolivet if (count < n) i++; 294c1414c8SBarry Smith *size = i; 303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 314c1414c8SBarry Smith } 329566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &ns_col)); 334c1414c8SBarry Smith 344c1414c8SBarry Smith /* Use the same row structure wherever feasible. 
*/ 35ad540459SPierre Jolivet for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i]; 364c1414c8SBarry Smith 374c1414c8SBarry Smith /* if m < n; pad up the remainder with inode_limit */ 38ad540459SPierre Jolivet for (; count + 1 < n; count++, i++) ns_col[i] = 1; 39aaa8cc7dSPierre Jolivet /* The last node is the odd ball. pad it up with the remaining rows; */ 404c1414c8SBarry Smith if (count < n) { 414c1414c8SBarry Smith ns_col[i] = n - count; 424c1414c8SBarry Smith i++; 434c1414c8SBarry Smith } else if (count > n) { 444c1414c8SBarry Smith /* Adjust for the over estimation */ 454c1414c8SBarry Smith ns_col[i - 1] += n - count; 464c1414c8SBarry Smith } 474c1414c8SBarry Smith *size = i; 484c1414c8SBarry Smith *ns = ns_col; 493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 504c1414c8SBarry Smith } 514c1414c8SBarry Smith 524c1414c8SBarry Smith /* 534c1414c8SBarry Smith This builds symmetric version of nonzero structure, 544c1414c8SBarry Smith */ 55d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 56d71ae5a4SJacob Faibussowitsch { 574c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 588758e1faSBarry Smith PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 598758e1faSBarry Smith PetscInt *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2; 608758e1faSBarry Smith const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 614c1414c8SBarry Smith 624c1414c8SBarry Smith PetscFunctionBegin; 634c1414c8SBarry Smith nslim_row = a->inode.node_count; 64d0f46423SBarry Smith m = A->rmap->n; 65d0f46423SBarry Smith n = A->cmap->n; 6608401ef6SPierre Jolivet PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 6708401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 684c1414c8SBarry 
Smith 694c1414c8SBarry Smith /* Use the row_inode as column_inode */ 704c1414c8SBarry Smith nslim_col = nslim_row; 714c1414c8SBarry Smith ns_col = ns_row; 724c1414c8SBarry Smith 7335cb6cd3SPierre Jolivet /* allocate space for reformatted inode structure */ 749566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 754c1414c8SBarry Smith for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1]; 764c1414c8SBarry Smith 774c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 784c1414c8SBarry Smith nsz = ns_col[i1]; 792205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 804c1414c8SBarry Smith } 814c1414c8SBarry Smith /* allocate space for row pointers */ 829566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 834c1414c8SBarry Smith *iia = ia; 849566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_row + 1, &work)); 854c1414c8SBarry Smith 864c1414c8SBarry Smith /* determine the number of columns in each row */ 874c1414c8SBarry Smith ia[0] = oshift; 884c1414c8SBarry Smith for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 894c1414c8SBarry Smith j = aj + ai[row] + ishift; 904c1414c8SBarry Smith jmax = aj + ai[row + 1] + ishift; 9183fed2edSSatish Balay if (j == jmax) continue; /* empty row */ 924c1414c8SBarry Smith col = *j++ + ishift; 934c1414c8SBarry Smith i2 = tvc[col]; 946aad120cSJose E. 
Roman while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 954c1414c8SBarry Smith ia[i1 + 1]++; 964c1414c8SBarry Smith ia[i2 + 1]++; 974c1414c8SBarry Smith i2++; /* Start col of next node */ 9890d2dec7SBarry Smith while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 994c1414c8SBarry Smith i2 = tvc[col]; 1004c1414c8SBarry Smith } 1014c1414c8SBarry Smith if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 1024c1414c8SBarry Smith } 1034c1414c8SBarry Smith 1044c1414c8SBarry Smith /* shift ia[i] to point to next row */ 1054c1414c8SBarry Smith for (i1 = 1; i1 < nslim_row + 1; i1++) { 1064c1414c8SBarry Smith row = ia[i1 - 1]; 1074c1414c8SBarry Smith ia[i1] += row; 1084c1414c8SBarry Smith work[i1 - 1] = row - oshift; 1094c1414c8SBarry Smith } 1104c1414c8SBarry Smith 1114c1414c8SBarry Smith /* allocate space for column pointers */ 1124c1414c8SBarry Smith nz = ia[nslim_row] + (!ishift); 1139566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja)); 1144c1414c8SBarry Smith *jja = ja; 1154c1414c8SBarry Smith 1164c1414c8SBarry Smith /* loop over lower triangular part putting into ja */ 1174c1414c8SBarry Smith for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 1184c1414c8SBarry Smith j = aj + ai[row] + ishift; 1194c1414c8SBarry Smith jmax = aj + ai[row + 1] + ishift; 12083fed2edSSatish Balay if (j == jmax) continue; /* empty row */ 1214c1414c8SBarry Smith col = *j++ + ishift; 1224c1414c8SBarry Smith i2 = tvc[col]; 1234c1414c8SBarry Smith while (i2 < i1 && j < jmax) { 1244c1414c8SBarry Smith ja[work[i2]++] = i1 + oshift; 1254c1414c8SBarry Smith ja[work[i1]++] = i2 + oshift; 1264c1414c8SBarry Smith ++i2; 12790d2dec7SBarry Smith while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 1284c1414c8SBarry Smith i2 = tvc[col]; 1294c1414c8SBarry Smith } 1304c1414c8SBarry Smith if (i2 == i1) ja[work[i1]++] = i2 + oshift; 1314c1414c8SBarry Smith } 1329566063dSJacob Faibussowitsch 
PetscCall(PetscFree(work)); 1339566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc)); 1343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1354c1414c8SBarry Smith } 1364c1414c8SBarry Smith 1374c1414c8SBarry Smith /* 1384c1414c8SBarry Smith This builds nonsymmetric version of nonzero structure, 1394c1414c8SBarry Smith */ 140d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 141d71ae5a4SJacob Faibussowitsch { 1424c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1438758e1faSBarry Smith PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 1448758e1faSBarry Smith PetscInt *tns, *tvc, nsz, i1, i2; 1458758e1faSBarry Smith const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size; 1464c1414c8SBarry Smith 1474c1414c8SBarry Smith PetscFunctionBegin; 14808401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 1494c1414c8SBarry Smith nslim_row = a->inode.node_count; 150d0f46423SBarry Smith n = A->cmap->n; 1514c1414c8SBarry Smith 1524c1414c8SBarry Smith /* Create The column_inode for this matrix */ 1539566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 1544c1414c8SBarry Smith 15535cb6cd3SPierre Jolivet /* allocate space for reformatted column_inode structure */ 1569566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 1574c1414c8SBarry Smith for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 1584c1414c8SBarry Smith 1594c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 1604c1414c8SBarry Smith nsz = ns_col[i1]; 1612205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 1624c1414c8SBarry Smith } 1634c1414c8SBarry Smith /* allocate space for row pointers */ 1649566063dSJacob Faibussowitsch 
PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 1654c1414c8SBarry Smith *iia = ia; 1669566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_row + 1, &work)); 1674c1414c8SBarry Smith 1684c1414c8SBarry Smith /* determine the number of columns in each row */ 1694c1414c8SBarry Smith ia[0] = oshift; 1704c1414c8SBarry Smith for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 1714c1414c8SBarry Smith j = aj + ai[row] + ishift; 17283fed2edSSatish Balay nz = ai[row + 1] - ai[row]; 17383fed2edSSatish Balay if (!nz) continue; /* empty row */ 1744c1414c8SBarry Smith col = *j++ + ishift; 1754c1414c8SBarry Smith i2 = tvc[col]; 1766aad120cSJose E. Roman while (nz-- > 0) { /* off-diagonal elements */ 1774c1414c8SBarry Smith ia[i1 + 1]++; 1784c1414c8SBarry Smith i2++; /* Start col of next node */ 179a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 1804c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 1814c1414c8SBarry Smith } 1824c1414c8SBarry Smith } 1834c1414c8SBarry Smith 1844c1414c8SBarry Smith /* shift ia[i] to point to next row */ 1854c1414c8SBarry Smith for (i1 = 1; i1 < nslim_row + 1; i1++) { 1864c1414c8SBarry Smith row = ia[i1 - 1]; 1874c1414c8SBarry Smith ia[i1] += row; 1884c1414c8SBarry Smith work[i1 - 1] = row - oshift; 1894c1414c8SBarry Smith } 1904c1414c8SBarry Smith 1914c1414c8SBarry Smith /* allocate space for column pointers */ 1924c1414c8SBarry Smith nz = ia[nslim_row] + (!ishift); 1939566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja)); 1944c1414c8SBarry Smith *jja = ja; 1954c1414c8SBarry Smith 1964c1414c8SBarry Smith /* loop over matrix putting into ja */ 1974c1414c8SBarry Smith for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 1984c1414c8SBarry Smith j = aj + ai[row] + ishift; 19983fed2edSSatish Balay nz = ai[row + 1] - ai[row]; 20083fed2edSSatish Balay if (!nz) continue; /* empty row */ 2014c1414c8SBarry Smith col = *j++ + ishift; 2024c1414c8SBarry Smith i2 = tvc[col]; 2034c1414c8SBarry Smith 
while (nz-- > 0) { 2044c1414c8SBarry Smith ja[work[i1]++] = i2 + oshift; 2054c1414c8SBarry Smith ++i2; 206a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 2074c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 2084c1414c8SBarry Smith } 2094c1414c8SBarry Smith } 2109566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col)); 2119566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 2129566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc)); 2133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2144c1414c8SBarry Smith } 2154c1414c8SBarry Smith 216d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 217d71ae5a4SJacob Faibussowitsch { 2184c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2194c1414c8SBarry Smith 2204c1414c8SBarry Smith PetscFunctionBegin; 22150ba90b4SBarry Smith if (n) *n = a->inode.node_count; 2223ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 2238f7157efSSatish Balay if (!blockcompressed) { 2249566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 2258f7157efSSatish Balay } else if (symmetric) { 2269566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 2274c1414c8SBarry Smith } else { 2289566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 2294c1414c8SBarry Smith } 2303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2314c1414c8SBarry Smith } 2324c1414c8SBarry Smith 233d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 234d71ae5a4SJacob Faibussowitsch { 
2354c1414c8SBarry Smith PetscFunctionBegin; 2363ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 2378f7157efSSatish Balay 2388f7157efSSatish Balay if (!blockcompressed) { 2399566063dSJacob Faibussowitsch PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 2408f7157efSSatish Balay } else { 2419566063dSJacob Faibussowitsch PetscCall(PetscFree(*ia)); 2429566063dSJacob Faibussowitsch PetscCall(PetscFree(*ja)); 2438f7157efSSatish Balay } 2443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2454c1414c8SBarry Smith } 2464c1414c8SBarry Smith 247d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 248d71ae5a4SJacob Faibussowitsch { 2494c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2504c1414c8SBarry Smith PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 2514c1414c8SBarry Smith PetscInt *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j; 2524c1414c8SBarry Smith 2534c1414c8SBarry Smith PetscFunctionBegin; 25408401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2554c1414c8SBarry Smith nslim_row = a->inode.node_count; 256d0f46423SBarry Smith n = A->cmap->n; 2574c1414c8SBarry Smith 2584c1414c8SBarry Smith /* Create The column_inode for this matrix */ 2599566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 2604c1414c8SBarry Smith 26135cb6cd3SPierre Jolivet /* allocate space for reformatted column_inode structure */ 2629566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 2634c1414c8SBarry Smith for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 2644c1414c8SBarry Smith 2654c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 2664c1414c8SBarry 
Smith nsz = ns_col[i1]; 2672205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 2684c1414c8SBarry Smith } 2694c1414c8SBarry Smith /* allocate space for column pointers */ 2709566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 2714c1414c8SBarry Smith *iia = ia; 2729566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_col + 1, &work)); 2734c1414c8SBarry Smith 2744c1414c8SBarry Smith /* determine the number of columns in each row */ 2754c1414c8SBarry Smith ia[0] = oshift; 2764c1414c8SBarry Smith for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 2774c1414c8SBarry Smith j = aj + ai[row] + ishift; 2784c1414c8SBarry Smith col = *j++ + ishift; 2794c1414c8SBarry Smith i2 = tvc[col]; 2804c1414c8SBarry Smith nz = ai[row + 1] - ai[row]; 2816aad120cSJose E. Roman while (nz-- > 0) { /* off-diagonal elements */ 2824c1414c8SBarry Smith /* ia[i1+1]++; */ 2834c1414c8SBarry Smith ia[i2 + 1]++; 2844c1414c8SBarry Smith i2++; 285a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 2864c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 2874c1414c8SBarry Smith } 2884c1414c8SBarry Smith } 2894c1414c8SBarry Smith 2904c1414c8SBarry Smith /* shift ia[i] to point to next col */ 2914c1414c8SBarry Smith for (i1 = 1; i1 < nslim_col + 1; i1++) { 2924c1414c8SBarry Smith col = ia[i1 - 1]; 2934c1414c8SBarry Smith ia[i1] += col; 2944c1414c8SBarry Smith work[i1 - 1] = col - oshift; 2954c1414c8SBarry Smith } 2964c1414c8SBarry Smith 2974c1414c8SBarry Smith /* allocate space for column pointers */ 2984c1414c8SBarry Smith nz = ia[nslim_col] + (!ishift); 2999566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja)); 3004c1414c8SBarry Smith *jja = ja; 3014c1414c8SBarry Smith 3024c1414c8SBarry Smith /* loop over matrix putting into ja */ 3034c1414c8SBarry Smith for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 3044c1414c8SBarry Smith j = aj + ai[row] + ishift; 3054c1414c8SBarry Smith col = *j++ + ishift; 
3064c1414c8SBarry Smith i2 = tvc[col]; 3074c1414c8SBarry Smith nz = ai[row + 1] - ai[row]; 3084c1414c8SBarry Smith while (nz-- > 0) { 3094c1414c8SBarry Smith /* ja[work[i1]++] = i2 + oshift; */ 3104c1414c8SBarry Smith ja[work[i2]++] = i1 + oshift; 3114c1414c8SBarry Smith i2++; 312a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 3134c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 3144c1414c8SBarry Smith } 3154c1414c8SBarry Smith } 3169566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col)); 3179566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 3189566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc)); 3193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3204c1414c8SBarry Smith } 3214c1414c8SBarry Smith 322d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 323d71ae5a4SJacob Faibussowitsch { 3244c1414c8SBarry Smith PetscFunctionBegin; 3259566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, n, NULL)); 3263ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 3274c1414c8SBarry Smith 3288f7157efSSatish Balay if (!blockcompressed) { 3299566063dSJacob Faibussowitsch PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 3308f7157efSSatish Balay } else if (symmetric) { 331a5b23f4aSJose E. 
Roman /* Since the indices are symmetric it doesn't matter */ 3329566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 3334c1414c8SBarry Smith } else { 3349566063dSJacob Faibussowitsch PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 3354c1414c8SBarry Smith } 3363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3374c1414c8SBarry Smith } 3384c1414c8SBarry Smith 339d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 340d71ae5a4SJacob Faibussowitsch { 3414c1414c8SBarry Smith PetscFunctionBegin; 3423ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 3438f7157efSSatish Balay if (!blockcompressed) { 3449566063dSJacob Faibussowitsch PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 3458f7157efSSatish Balay } else { 3469566063dSJacob Faibussowitsch PetscCall(PetscFree(*ia)); 3479566063dSJacob Faibussowitsch PetscCall(PetscFree(*ja)); 3488f7157efSSatish Balay } 3493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3504c1414c8SBarry Smith } 3514c1414c8SBarry Smith 352d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 353d71ae5a4SJacob Faibussowitsch { 3544c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3554c1414c8SBarry Smith PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 356d9fead3dSBarry Smith PetscScalar *y; 357dd6ea824SBarry Smith const PetscScalar *x; 358dd6ea824SBarry Smith const MatScalar *v1, *v2, *v3, *v4, *v5; 3598758e1faSBarry Smith PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0; 3608758e1faSBarry Smith const PetscInt *idx, *ns, *ii; 3614c1414c8SBarry Smith 3624c1414c8SBarry Smith #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 3634c1414c8SBarry Smith #pragma 
disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 3644c1414c8SBarry Smith #endif 3654c1414c8SBarry Smith 3664c1414c8SBarry Smith PetscFunctionBegin; 36708401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 3684c1414c8SBarry Smith node_max = a->inode.node_count; 3694c1414c8SBarry Smith ns = a->inode.size; /* Node Size array */ 3709566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3719566063dSJacob Faibussowitsch PetscCall(VecGetArray(yy, &y)); 3724c1414c8SBarry Smith idx = a->j; 3734c1414c8SBarry Smith v1 = a->a; 3744c1414c8SBarry Smith ii = a->i; 3754c1414c8SBarry Smith 3764c1414c8SBarry Smith for (i = 0, row = 0; i < node_max; ++i) { 3774c1414c8SBarry Smith nsz = ns[i]; 3784c1414c8SBarry Smith n = ii[1] - ii[0]; 37998c9bda7SSatish Balay nonzerorow += (n > 0) * nsz; 3804c1414c8SBarry Smith ii += nsz; 38150d8bf02SJed Brown PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 38250d8bf02SJed Brown PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 3834c1414c8SBarry Smith sz = n; /* No of non zeros in this row */ 3844c1414c8SBarry Smith /* Switch on the size of Node */ 3854c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */ 3864c1414c8SBarry Smith case 1: 38775567043SBarry Smith sum1 = 0.; 3884c1414c8SBarry Smith 3894c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 3904c1414c8SBarry Smith i1 = idx[0]; /* The instructions are ordered to */ 3914c1414c8SBarry Smith i2 = idx[1]; /* make the compiler's job easy */ 3924c1414c8SBarry Smith idx += 2; 3934c1414c8SBarry Smith tmp0 = x[i1]; 3944c1414c8SBarry Smith tmp1 = x[i2]; 3959371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 3969371c9d4SSatish Balay v1 += 2; 3974c1414c8SBarry Smith } 3984c1414c8SBarry Smith 3994c1414c8SBarry Smith if (n == sz - 1) { /* Take care of 
the last nonzero */ 4004c1414c8SBarry Smith tmp0 = x[*idx++]; 4014c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4024c1414c8SBarry Smith } 4034c1414c8SBarry Smith y[row++] = sum1; 4044c1414c8SBarry Smith break; 4054c1414c8SBarry Smith case 2: 40675567043SBarry Smith sum1 = 0.; 40775567043SBarry Smith sum2 = 0.; 4084c1414c8SBarry Smith v2 = v1 + n; 4094c1414c8SBarry Smith 4104c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 4114c1414c8SBarry Smith i1 = idx[0]; 4124c1414c8SBarry Smith i2 = idx[1]; 4134c1414c8SBarry Smith idx += 2; 4144c1414c8SBarry Smith tmp0 = x[i1]; 4154c1414c8SBarry Smith tmp1 = x[i2]; 4169371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4179371c9d4SSatish Balay v1 += 2; 4189371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 4199371c9d4SSatish Balay v2 += 2; 4204c1414c8SBarry Smith } 4214c1414c8SBarry Smith if (n == sz - 1) { 4224c1414c8SBarry Smith tmp0 = x[*idx++]; 4234c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4244c1414c8SBarry Smith sum2 += *v2++ * tmp0; 4254c1414c8SBarry Smith } 4264c1414c8SBarry Smith y[row++] = sum1; 4274c1414c8SBarry Smith y[row++] = sum2; 4284c1414c8SBarry Smith v1 = v2; /* Since the next block to be processed starts there*/ 4294c1414c8SBarry Smith idx += sz; 4304c1414c8SBarry Smith break; 4314c1414c8SBarry Smith case 3: 43275567043SBarry Smith sum1 = 0.; 43375567043SBarry Smith sum2 = 0.; 43475567043SBarry Smith sum3 = 0.; 4354c1414c8SBarry Smith v2 = v1 + n; 4364c1414c8SBarry Smith v3 = v2 + n; 4374c1414c8SBarry Smith 4384c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 4394c1414c8SBarry Smith i1 = idx[0]; 4404c1414c8SBarry Smith i2 = idx[1]; 4414c1414c8SBarry Smith idx += 2; 4424c1414c8SBarry Smith tmp0 = x[i1]; 4434c1414c8SBarry Smith tmp1 = x[i2]; 4449371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4459371c9d4SSatish Balay v1 += 2; 4469371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 4479371c9d4SSatish Balay v2 += 2; 4489371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 
4499371c9d4SSatish Balay v3 += 2; 4504c1414c8SBarry Smith } 4514c1414c8SBarry Smith if (n == sz - 1) { 4524c1414c8SBarry Smith tmp0 = x[*idx++]; 4534c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4544c1414c8SBarry Smith sum2 += *v2++ * tmp0; 4554c1414c8SBarry Smith sum3 += *v3++ * tmp0; 4564c1414c8SBarry Smith } 4574c1414c8SBarry Smith y[row++] = sum1; 4584c1414c8SBarry Smith y[row++] = sum2; 4594c1414c8SBarry Smith y[row++] = sum3; 4604c1414c8SBarry Smith v1 = v3; /* Since the next block to be processed starts there*/ 4614c1414c8SBarry Smith idx += 2 * sz; 4624c1414c8SBarry Smith break; 4634c1414c8SBarry Smith case 4: 46475567043SBarry Smith sum1 = 0.; 46575567043SBarry Smith sum2 = 0.; 46675567043SBarry Smith sum3 = 0.; 46775567043SBarry Smith sum4 = 0.; 4684c1414c8SBarry Smith v2 = v1 + n; 4694c1414c8SBarry Smith v3 = v2 + n; 4704c1414c8SBarry Smith v4 = v3 + n; 4714c1414c8SBarry Smith 4724c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 4734c1414c8SBarry Smith i1 = idx[0]; 4744c1414c8SBarry Smith i2 = idx[1]; 4754c1414c8SBarry Smith idx += 2; 4764c1414c8SBarry Smith tmp0 = x[i1]; 4774c1414c8SBarry Smith tmp1 = x[i2]; 4789371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4799371c9d4SSatish Balay v1 += 2; 4809371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 4819371c9d4SSatish Balay v2 += 2; 4829371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 4839371c9d4SSatish Balay v3 += 2; 4849371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1; 4859371c9d4SSatish Balay v4 += 2; 4864c1414c8SBarry Smith } 4874c1414c8SBarry Smith if (n == sz - 1) { 4884c1414c8SBarry Smith tmp0 = x[*idx++]; 4894c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4904c1414c8SBarry Smith sum2 += *v2++ * tmp0; 4914c1414c8SBarry Smith sum3 += *v3++ * tmp0; 4924c1414c8SBarry Smith sum4 += *v4++ * tmp0; 4934c1414c8SBarry Smith } 4944c1414c8SBarry Smith y[row++] = sum1; 4954c1414c8SBarry Smith y[row++] = sum2; 4964c1414c8SBarry Smith y[row++] = sum3; 4974c1414c8SBarry Smith y[row++] 
= sum4; 4984c1414c8SBarry Smith v1 = v4; /* Since the next block to be processed starts there*/ 4994c1414c8SBarry Smith idx += 3 * sz; 5004c1414c8SBarry Smith break; 5014c1414c8SBarry Smith case 5: 50275567043SBarry Smith sum1 = 0.; 50375567043SBarry Smith sum2 = 0.; 50475567043SBarry Smith sum3 = 0.; 50575567043SBarry Smith sum4 = 0.; 50675567043SBarry Smith sum5 = 0.; 5074c1414c8SBarry Smith v2 = v1 + n; 5084c1414c8SBarry Smith v3 = v2 + n; 5094c1414c8SBarry Smith v4 = v3 + n; 5104c1414c8SBarry Smith v5 = v4 + n; 5114c1414c8SBarry Smith 5124c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 5134c1414c8SBarry Smith i1 = idx[0]; 5144c1414c8SBarry Smith i2 = idx[1]; 5154c1414c8SBarry Smith idx += 2; 5164c1414c8SBarry Smith tmp0 = x[i1]; 5174c1414c8SBarry Smith tmp1 = x[i2]; 5189371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 5199371c9d4SSatish Balay v1 += 2; 5209371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 5219371c9d4SSatish Balay v2 += 2; 5229371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 5239371c9d4SSatish Balay v3 += 2; 5249371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1; 5259371c9d4SSatish Balay v4 += 2; 5269371c9d4SSatish Balay sum5 += v5[0] * tmp0 + v5[1] * tmp1; 5279371c9d4SSatish Balay v5 += 2; 5284c1414c8SBarry Smith } 5294c1414c8SBarry Smith if (n == sz - 1) { 5304c1414c8SBarry Smith tmp0 = x[*idx++]; 5314c1414c8SBarry Smith sum1 += *v1++ * tmp0; 5324c1414c8SBarry Smith sum2 += *v2++ * tmp0; 5334c1414c8SBarry Smith sum3 += *v3++ * tmp0; 5344c1414c8SBarry Smith sum4 += *v4++ * tmp0; 5354c1414c8SBarry Smith sum5 += *v5++ * tmp0; 5364c1414c8SBarry Smith } 5374c1414c8SBarry Smith y[row++] = sum1; 5384c1414c8SBarry Smith y[row++] = sum2; 5394c1414c8SBarry Smith y[row++] = sum3; 5404c1414c8SBarry Smith y[row++] = sum4; 5414c1414c8SBarry Smith y[row++] = sum5; 5424c1414c8SBarry Smith v1 = v5; /* Since the next block to be processed starts there */ 5434c1414c8SBarry Smith idx += 4 * sz; 5444c1414c8SBarry Smith break; 
545d71ae5a4SJacob Faibussowitsch default: 546d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 5474c1414c8SBarry Smith } 5484c1414c8SBarry Smith } 5499566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5509566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(yy, &y)); 5519566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 5523ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5534c1414c8SBarry Smith } 5542ef1f0ffSBarry Smith 5554108e4d5SBarry Smith /* Almost same code as the MatMult_SeqAIJ_Inode() */ 556d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 557d71ae5a4SJacob Faibussowitsch { 5584c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 5594c1414c8SBarry Smith PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 5608758e1faSBarry Smith const MatScalar *v1, *v2, *v3, *v4, *v5; 5618758e1faSBarry Smith const PetscScalar *x; 5628758e1faSBarry Smith PetscScalar *y, *z, *zt; 5638758e1faSBarry Smith PetscInt i1, i2, n, i, row, node_max, nsz, sz; 5648758e1faSBarry Smith const PetscInt *idx, *ns, *ii; 5654c1414c8SBarry Smith 5664c1414c8SBarry Smith PetscFunctionBegin; 56708401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 5684c1414c8SBarry Smith node_max = a->inode.node_count; 5694c1414c8SBarry Smith ns = a->inode.size; /* Node Size array */ 5702205254eSKarl Rupp 5719566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5729566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 5734c1414c8SBarry Smith zt = z; 5744c1414c8SBarry Smith 5754c1414c8SBarry Smith idx = a->j; 5764c1414c8SBarry Smith v1 = a->a; 5774c1414c8SBarry Smith ii = a->i; 5784c1414c8SBarry Smith 5794c1414c8SBarry Smith for (i = 0, row = 0; i < node_max; ++i) { 5804c1414c8SBarry Smith nsz = ns[i]; 5814c1414c8SBarry Smith n = ii[1] - ii[0]; 
5824c1414c8SBarry Smith ii += nsz; 5834c1414c8SBarry Smith sz = n; /* No of non zeros in this row */ 5844c1414c8SBarry Smith /* Switch on the size of Node */ 5854c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */ 5864c1414c8SBarry Smith case 1: 5874c1414c8SBarry Smith sum1 = *zt++; 5884c1414c8SBarry Smith 5894c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 5904c1414c8SBarry Smith i1 = idx[0]; /* The instructions are ordered to */ 5914c1414c8SBarry Smith i2 = idx[1]; /* make the compiler's job easy */ 5924c1414c8SBarry Smith idx += 2; 5934c1414c8SBarry Smith tmp0 = x[i1]; 5944c1414c8SBarry Smith tmp1 = x[i2]; 5959371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 5969371c9d4SSatish Balay v1 += 2; 5974c1414c8SBarry Smith } 5984c1414c8SBarry Smith 5994c1414c8SBarry Smith if (n == sz - 1) { /* Take care of the last nonzero */ 6004c1414c8SBarry Smith tmp0 = x[*idx++]; 6014c1414c8SBarry Smith sum1 += *v1++ * tmp0; 6024c1414c8SBarry Smith } 6034c1414c8SBarry Smith y[row++] = sum1; 6044c1414c8SBarry Smith break; 6054c1414c8SBarry Smith case 2: 6064c1414c8SBarry Smith sum1 = *zt++; 6074c1414c8SBarry Smith sum2 = *zt++; 6084c1414c8SBarry Smith v2 = v1 + n; 6094c1414c8SBarry Smith 6104c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 6114c1414c8SBarry Smith i1 = idx[0]; 6124c1414c8SBarry Smith i2 = idx[1]; 6134c1414c8SBarry Smith idx += 2; 6144c1414c8SBarry Smith tmp0 = x[i1]; 6154c1414c8SBarry Smith tmp1 = x[i2]; 6169371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 6179371c9d4SSatish Balay v1 += 2; 6189371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 6199371c9d4SSatish Balay v2 += 2; 6204c1414c8SBarry Smith } 6214c1414c8SBarry Smith if (n == sz - 1) { 6224c1414c8SBarry Smith tmp0 = x[*idx++]; 6234c1414c8SBarry Smith sum1 += *v1++ * tmp0; 6244c1414c8SBarry Smith sum2 += *v2++ * tmp0; 6254c1414c8SBarry Smith } 6264c1414c8SBarry Smith y[row++] = sum1; 6274c1414c8SBarry Smith y[row++] = sum2; 6284c1414c8SBarry Smith v1 = 
v2; /* Since the next block to be processed starts there*/ 6294c1414c8SBarry Smith idx += sz; 6304c1414c8SBarry Smith break; 6314c1414c8SBarry Smith case 3: 6324c1414c8SBarry Smith sum1 = *zt++; 6334c1414c8SBarry Smith sum2 = *zt++; 6344c1414c8SBarry Smith sum3 = *zt++; 6354c1414c8SBarry Smith v2 = v1 + n; 6364c1414c8SBarry Smith v3 = v2 + n; 6374c1414c8SBarry Smith 6384c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 6394c1414c8SBarry Smith i1 = idx[0]; 6404c1414c8SBarry Smith i2 = idx[1]; 6414c1414c8SBarry Smith idx += 2; 6424c1414c8SBarry Smith tmp0 = x[i1]; 6434c1414c8SBarry Smith tmp1 = x[i2]; 6449371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 6459371c9d4SSatish Balay v1 += 2; 6469371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 6479371c9d4SSatish Balay v2 += 2; 6489371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 6499371c9d4SSatish Balay v3 += 2; 6504c1414c8SBarry Smith } 6514c1414c8SBarry Smith if (n == sz - 1) { 6524c1414c8SBarry Smith tmp0 = x[*idx++]; 6534c1414c8SBarry Smith sum1 += *v1++ * tmp0; 6544c1414c8SBarry Smith sum2 += *v2++ * tmp0; 6554c1414c8SBarry Smith sum3 += *v3++ * tmp0; 6564c1414c8SBarry Smith } 6574c1414c8SBarry Smith y[row++] = sum1; 6584c1414c8SBarry Smith y[row++] = sum2; 6594c1414c8SBarry Smith y[row++] = sum3; 6604c1414c8SBarry Smith v1 = v3; /* Since the next block to be processed starts there*/ 6614c1414c8SBarry Smith idx += 2 * sz; 6624c1414c8SBarry Smith break; 6634c1414c8SBarry Smith case 4: 6644c1414c8SBarry Smith sum1 = *zt++; 6654c1414c8SBarry Smith sum2 = *zt++; 6664c1414c8SBarry Smith sum3 = *zt++; 6674c1414c8SBarry Smith sum4 = *zt++; 6684c1414c8SBarry Smith v2 = v1 + n; 6694c1414c8SBarry Smith v3 = v2 + n; 6704c1414c8SBarry Smith v4 = v3 + n; 6714c1414c8SBarry Smith 6724c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 6734c1414c8SBarry Smith i1 = idx[0]; 6744c1414c8SBarry Smith i2 = idx[1]; 6754c1414c8SBarry Smith idx += 2; 6764c1414c8SBarry Smith tmp0 = x[i1]; 6774c1414c8SBarry 
Smith tmp1 = x[i2]; 6789371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 6799371c9d4SSatish Balay v1 += 2; 6809371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 6819371c9d4SSatish Balay v2 += 2; 6829371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 6839371c9d4SSatish Balay v3 += 2; 6849371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1; 6859371c9d4SSatish Balay v4 += 2; 6864c1414c8SBarry Smith } 6874c1414c8SBarry Smith if (n == sz - 1) { 6884c1414c8SBarry Smith tmp0 = x[*idx++]; 6894c1414c8SBarry Smith sum1 += *v1++ * tmp0; 6904c1414c8SBarry Smith sum2 += *v2++ * tmp0; 6914c1414c8SBarry Smith sum3 += *v3++ * tmp0; 6924c1414c8SBarry Smith sum4 += *v4++ * tmp0; 6934c1414c8SBarry Smith } 6944c1414c8SBarry Smith y[row++] = sum1; 6954c1414c8SBarry Smith y[row++] = sum2; 6964c1414c8SBarry Smith y[row++] = sum3; 6974c1414c8SBarry Smith y[row++] = sum4; 6984c1414c8SBarry Smith v1 = v4; /* Since the next block to be processed starts there*/ 6994c1414c8SBarry Smith idx += 3 * sz; 7004c1414c8SBarry Smith break; 7014c1414c8SBarry Smith case 5: 7024c1414c8SBarry Smith sum1 = *zt++; 7034c1414c8SBarry Smith sum2 = *zt++; 7044c1414c8SBarry Smith sum3 = *zt++; 7054c1414c8SBarry Smith sum4 = *zt++; 7064c1414c8SBarry Smith sum5 = *zt++; 7074c1414c8SBarry Smith v2 = v1 + n; 7084c1414c8SBarry Smith v3 = v2 + n; 7094c1414c8SBarry Smith v4 = v3 + n; 7104c1414c8SBarry Smith v5 = v4 + n; 7114c1414c8SBarry Smith 7124c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 7134c1414c8SBarry Smith i1 = idx[0]; 7144c1414c8SBarry Smith i2 = idx[1]; 7154c1414c8SBarry Smith idx += 2; 7164c1414c8SBarry Smith tmp0 = x[i1]; 7174c1414c8SBarry Smith tmp1 = x[i2]; 7189371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 7199371c9d4SSatish Balay v1 += 2; 7209371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 7219371c9d4SSatish Balay v2 += 2; 7229371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 7239371c9d4SSatish Balay v3 += 2; 7249371c9d4SSatish Balay 
sum4 += v4[0] * tmp0 + v4[1] * tmp1; 7259371c9d4SSatish Balay v4 += 2; 7269371c9d4SSatish Balay sum5 += v5[0] * tmp0 + v5[1] * tmp1; 7279371c9d4SSatish Balay v5 += 2; 7284c1414c8SBarry Smith } 7294c1414c8SBarry Smith if (n == sz - 1) { 7304c1414c8SBarry Smith tmp0 = x[*idx++]; 7314c1414c8SBarry Smith sum1 += *v1++ * tmp0; 7324c1414c8SBarry Smith sum2 += *v2++ * tmp0; 7334c1414c8SBarry Smith sum3 += *v3++ * tmp0; 7344c1414c8SBarry Smith sum4 += *v4++ * tmp0; 7354c1414c8SBarry Smith sum5 += *v5++ * tmp0; 7364c1414c8SBarry Smith } 7374c1414c8SBarry Smith y[row++] = sum1; 7384c1414c8SBarry Smith y[row++] = sum2; 7394c1414c8SBarry Smith y[row++] = sum3; 7404c1414c8SBarry Smith y[row++] = sum4; 7414c1414c8SBarry Smith y[row++] = sum5; 7424c1414c8SBarry Smith v1 = v5; /* Since the next block to be processed starts there */ 7434c1414c8SBarry Smith idx += 4 * sz; 7444c1414c8SBarry Smith break; 745d71ae5a4SJacob Faibussowitsch default: 746d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 7474c1414c8SBarry Smith } 7484c1414c8SBarry Smith } 7499566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7509566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 7519566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 7523ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 7534c1414c8SBarry Smith } 7544c1414c8SBarry Smith 755ff6a9541SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 756d71ae5a4SJacob Faibussowitsch { 7574c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 7584c1414c8SBarry Smith IS iscol = a->col, isrow = a->row; 7595d0c19d7SBarry Smith const PetscInt *r, *c, *rout, *cout; 7608758e1faSBarry Smith PetscInt i, j, n = A->rmap->n, nz; 7618758e1faSBarry Smith PetscInt node_max, *ns, row, nsz, aii, i0, i1; 7628758e1faSBarry Smith const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 763d9fead3dSBarry 
Smith PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 764d9fead3dSBarry Smith PetscScalar sum1, sum2, sum3, sum4, sum5; 765dd6ea824SBarry Smith const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 766dd6ea824SBarry Smith const PetscScalar *b; 7674c1414c8SBarry Smith 7684c1414c8SBarry Smith PetscFunctionBegin; 76908401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 7704c1414c8SBarry Smith node_max = a->inode.node_count; 7714c1414c8SBarry Smith ns = a->inode.size; /* Node Size array */ 7724c1414c8SBarry Smith 7739566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 7749566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(xx, &x)); 7754c1414c8SBarry Smith tmp = a->solve_work; 7764c1414c8SBarry Smith 7779371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 7789371c9d4SSatish Balay r = rout; 7799371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 7809371c9d4SSatish Balay c = cout + (n - 1); 7814c1414c8SBarry Smith 7824c1414c8SBarry Smith /* forward solve the lower triangular */ 7834c1414c8SBarry Smith tmps = tmp; 7844c1414c8SBarry Smith aa = a_a; 7854c1414c8SBarry Smith aj = a_j; 7864c1414c8SBarry Smith ad = a->diag; 7874c1414c8SBarry Smith 7884c1414c8SBarry Smith for (i = 0, row = 0; i < node_max; ++i) { 7894c1414c8SBarry Smith nsz = ns[i]; 7904c1414c8SBarry Smith aii = ai[row]; 7914c1414c8SBarry Smith v1 = aa + aii; 7924c1414c8SBarry Smith vi = aj + aii; 7934c1414c8SBarry Smith nz = ad[row] - aii; 79426549573SJed Brown if (i < node_max - 1) { 79526549573SJed Brown /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 79691c35059SPierre Jolivet * but our indexing to determine its size could. 
*/ 79750d8bf02SJed Brown PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 79826549573SJed Brown /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 79950d8bf02SJed Brown PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 80026549573SJed Brown /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 80126549573SJed Brown } 8024c1414c8SBarry Smith 8034c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */ 8044c1414c8SBarry Smith case 1: 8054c1414c8SBarry Smith sum1 = b[*r++]; 8064c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 8074c1414c8SBarry Smith i0 = vi[0]; 8084c1414c8SBarry Smith i1 = vi[1]; 8094c1414c8SBarry Smith vi += 2; 8104c1414c8SBarry Smith tmp0 = tmps[i0]; 8114c1414c8SBarry Smith tmp1 = tmps[i1]; 8129371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 8139371c9d4SSatish Balay v1 += 2; 8144c1414c8SBarry Smith } 8154c1414c8SBarry Smith if (j == nz - 1) { 8164c1414c8SBarry Smith tmp0 = tmps[*vi++]; 8174c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 8184c1414c8SBarry Smith } 8194c1414c8SBarry Smith tmp[row++] = sum1; 8204c1414c8SBarry Smith break; 8214c1414c8SBarry Smith case 2: 8224c1414c8SBarry Smith sum1 = b[*r++]; 8234c1414c8SBarry Smith sum2 = b[*r++]; 8244c1414c8SBarry Smith v2 = aa + ai[row + 1]; 8254c1414c8SBarry Smith 8264c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 8274c1414c8SBarry Smith i0 = vi[0]; 8284c1414c8SBarry Smith i1 = vi[1]; 8294c1414c8SBarry Smith vi += 2; 8304c1414c8SBarry Smith tmp0 = tmps[i0]; 8314c1414c8SBarry Smith tmp1 = tmps[i1]; 8329371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 8339371c9d4SSatish Balay v1 += 2; 8349371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 8359371c9d4SSatish Balay v2 += 2; 8364c1414c8SBarry Smith } 8374c1414c8SBarry Smith if (j 
== nz - 1) { 8384c1414c8SBarry Smith tmp0 = tmps[*vi++]; 8394c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 8404c1414c8SBarry Smith sum2 -= *v2++ * tmp0; 8414c1414c8SBarry Smith } 8424c1414c8SBarry Smith sum2 -= *v2++ * sum1; 8434c1414c8SBarry Smith tmp[row++] = sum1; 8444c1414c8SBarry Smith tmp[row++] = sum2; 8454c1414c8SBarry Smith break; 8464c1414c8SBarry Smith case 3: 8474c1414c8SBarry Smith sum1 = b[*r++]; 8484c1414c8SBarry Smith sum2 = b[*r++]; 8494c1414c8SBarry Smith sum3 = b[*r++]; 8504c1414c8SBarry Smith v2 = aa + ai[row + 1]; 8514c1414c8SBarry Smith v3 = aa + ai[row + 2]; 8524c1414c8SBarry Smith 8534c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 8544c1414c8SBarry Smith i0 = vi[0]; 8554c1414c8SBarry Smith i1 = vi[1]; 8564c1414c8SBarry Smith vi += 2; 8574c1414c8SBarry Smith tmp0 = tmps[i0]; 8584c1414c8SBarry Smith tmp1 = tmps[i1]; 8599371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 8609371c9d4SSatish Balay v1 += 2; 8619371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 8629371c9d4SSatish Balay v2 += 2; 8639371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 8649371c9d4SSatish Balay v3 += 2; 8654c1414c8SBarry Smith } 8664c1414c8SBarry Smith if (j == nz - 1) { 8674c1414c8SBarry Smith tmp0 = tmps[*vi++]; 8684c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 8694c1414c8SBarry Smith sum2 -= *v2++ * tmp0; 8704c1414c8SBarry Smith sum3 -= *v3++ * tmp0; 8714c1414c8SBarry Smith } 8724c1414c8SBarry Smith sum2 -= *v2++ * sum1; 8734c1414c8SBarry Smith sum3 -= *v3++ * sum1; 8744c1414c8SBarry Smith sum3 -= *v3++ * sum2; 8752205254eSKarl Rupp 8764c1414c8SBarry Smith tmp[row++] = sum1; 8774c1414c8SBarry Smith tmp[row++] = sum2; 8784c1414c8SBarry Smith tmp[row++] = sum3; 8794c1414c8SBarry Smith break; 8804c1414c8SBarry Smith 8814c1414c8SBarry Smith case 4: 8824c1414c8SBarry Smith sum1 = b[*r++]; 8834c1414c8SBarry Smith sum2 = b[*r++]; 8844c1414c8SBarry Smith sum3 = b[*r++]; 8854c1414c8SBarry Smith sum4 = b[*r++]; 8864c1414c8SBarry Smith v2 = aa + ai[row + 
1]; 8874c1414c8SBarry Smith v3 = aa + ai[row + 2]; 8884c1414c8SBarry Smith v4 = aa + ai[row + 3]; 8894c1414c8SBarry Smith 8904c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 8914c1414c8SBarry Smith i0 = vi[0]; 8924c1414c8SBarry Smith i1 = vi[1]; 8934c1414c8SBarry Smith vi += 2; 8944c1414c8SBarry Smith tmp0 = tmps[i0]; 8954c1414c8SBarry Smith tmp1 = tmps[i1]; 8969371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 8979371c9d4SSatish Balay v1 += 2; 8989371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 8999371c9d4SSatish Balay v2 += 2; 9009371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 9019371c9d4SSatish Balay v3 += 2; 9029371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 9039371c9d4SSatish Balay v4 += 2; 9044c1414c8SBarry Smith } 9054c1414c8SBarry Smith if (j == nz - 1) { 9064c1414c8SBarry Smith tmp0 = tmps[*vi++]; 9074c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 9084c1414c8SBarry Smith sum2 -= *v2++ * tmp0; 9094c1414c8SBarry Smith sum3 -= *v3++ * tmp0; 9104c1414c8SBarry Smith sum4 -= *v4++ * tmp0; 9114c1414c8SBarry Smith } 9124c1414c8SBarry Smith sum2 -= *v2++ * sum1; 9134c1414c8SBarry Smith sum3 -= *v3++ * sum1; 9144c1414c8SBarry Smith sum4 -= *v4++ * sum1; 9154c1414c8SBarry Smith sum3 -= *v3++ * sum2; 9164c1414c8SBarry Smith sum4 -= *v4++ * sum2; 9174c1414c8SBarry Smith sum4 -= *v4++ * sum3; 9184c1414c8SBarry Smith 9194c1414c8SBarry Smith tmp[row++] = sum1; 9204c1414c8SBarry Smith tmp[row++] = sum2; 9214c1414c8SBarry Smith tmp[row++] = sum3; 9224c1414c8SBarry Smith tmp[row++] = sum4; 9234c1414c8SBarry Smith break; 9244c1414c8SBarry Smith case 5: 9254c1414c8SBarry Smith sum1 = b[*r++]; 9264c1414c8SBarry Smith sum2 = b[*r++]; 9274c1414c8SBarry Smith sum3 = b[*r++]; 9284c1414c8SBarry Smith sum4 = b[*r++]; 9294c1414c8SBarry Smith sum5 = b[*r++]; 9304c1414c8SBarry Smith v2 = aa + ai[row + 1]; 9314c1414c8SBarry Smith v3 = aa + ai[row + 2]; 9324c1414c8SBarry Smith v4 = aa + ai[row + 3]; 9334c1414c8SBarry Smith v5 = aa + ai[row + 4]; 
9344c1414c8SBarry Smith 9354c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 9364c1414c8SBarry Smith i0 = vi[0]; 9374c1414c8SBarry Smith i1 = vi[1]; 9384c1414c8SBarry Smith vi += 2; 9394c1414c8SBarry Smith tmp0 = tmps[i0]; 9404c1414c8SBarry Smith tmp1 = tmps[i1]; 9419371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 9429371c9d4SSatish Balay v1 += 2; 9439371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 9449371c9d4SSatish Balay v2 += 2; 9459371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 9469371c9d4SSatish Balay v3 += 2; 9479371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 9489371c9d4SSatish Balay v4 += 2; 9499371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 9509371c9d4SSatish Balay v5 += 2; 9514c1414c8SBarry Smith } 9524c1414c8SBarry Smith if (j == nz - 1) { 9534c1414c8SBarry Smith tmp0 = tmps[*vi++]; 9544c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 9554c1414c8SBarry Smith sum2 -= *v2++ * tmp0; 9564c1414c8SBarry Smith sum3 -= *v3++ * tmp0; 9574c1414c8SBarry Smith sum4 -= *v4++ * tmp0; 9584c1414c8SBarry Smith sum5 -= *v5++ * tmp0; 9594c1414c8SBarry Smith } 9604c1414c8SBarry Smith 9614c1414c8SBarry Smith sum2 -= *v2++ * sum1; 9624c1414c8SBarry Smith sum3 -= *v3++ * sum1; 9634c1414c8SBarry Smith sum4 -= *v4++ * sum1; 9644c1414c8SBarry Smith sum5 -= *v5++ * sum1; 9654c1414c8SBarry Smith sum3 -= *v3++ * sum2; 9664c1414c8SBarry Smith sum4 -= *v4++ * sum2; 9674c1414c8SBarry Smith sum5 -= *v5++ * sum2; 9684c1414c8SBarry Smith sum4 -= *v4++ * sum3; 9694c1414c8SBarry Smith sum5 -= *v5++ * sum3; 9704c1414c8SBarry Smith sum5 -= *v5++ * sum4; 9714c1414c8SBarry Smith 9724c1414c8SBarry Smith tmp[row++] = sum1; 9734c1414c8SBarry Smith tmp[row++] = sum2; 9744c1414c8SBarry Smith tmp[row++] = sum3; 9754c1414c8SBarry Smith tmp[row++] = sum4; 9764c1414c8SBarry Smith tmp[row++] = sum5; 9774c1414c8SBarry Smith break; 978d71ae5a4SJacob Faibussowitsch default: 979d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size 
not yet supported "); 9804c1414c8SBarry Smith } 9814c1414c8SBarry Smith } 9824c1414c8SBarry Smith /* backward solve the upper triangular */ 9834c1414c8SBarry Smith for (i = node_max - 1, row = n - 1; i >= 0; i--) { 9844c1414c8SBarry Smith nsz = ns[i]; 9854c1414c8SBarry Smith aii = ai[row + 1] - 1; 9864c1414c8SBarry Smith v1 = aa + aii; 9874c1414c8SBarry Smith vi = aj + aii; 9884c1414c8SBarry Smith nz = aii - ad[row]; 9894c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */ 9904c1414c8SBarry Smith case 1: 9914c1414c8SBarry Smith sum1 = tmp[row]; 9924c1414c8SBarry Smith 9934c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 9944c1414c8SBarry Smith vi -= 2; 9954c1414c8SBarry Smith i0 = vi[2]; 9964c1414c8SBarry Smith i1 = vi[1]; 9974c1414c8SBarry Smith tmp0 = tmps[i0]; 9984c1414c8SBarry Smith tmp1 = tmps[i1]; 9994c1414c8SBarry Smith v1 -= 2; 10004c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 10014c1414c8SBarry Smith } 10024c1414c8SBarry Smith if (j == 1) { 10034c1414c8SBarry Smith tmp0 = tmps[*vi--]; 10044c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 10054c1414c8SBarry Smith } 10069371c9d4SSatish Balay x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 10079371c9d4SSatish Balay row--; 10084c1414c8SBarry Smith break; 10094c1414c8SBarry Smith case 2: 10104c1414c8SBarry Smith sum1 = tmp[row]; 10114c1414c8SBarry Smith sum2 = tmp[row - 1]; 10124c1414c8SBarry Smith v2 = aa + ai[row] - 1; 10134c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 10144c1414c8SBarry Smith vi -= 2; 10154c1414c8SBarry Smith i0 = vi[2]; 10164c1414c8SBarry Smith i1 = vi[1]; 10174c1414c8SBarry Smith tmp0 = tmps[i0]; 10184c1414c8SBarry Smith tmp1 = tmps[i1]; 10194c1414c8SBarry Smith v1 -= 2; 10204c1414c8SBarry Smith v2 -= 2; 10214c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 10224c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 10234c1414c8SBarry Smith } 10244c1414c8SBarry Smith if (j == 1) { 10254c1414c8SBarry Smith tmp0 = tmps[*vi--]; 10264c1414c8SBarry Smith sum1 
-= *v1-- * tmp0; 10274c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 10284c1414c8SBarry Smith } 10294c1414c8SBarry Smith 10309371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 10319371c9d4SSatish Balay row--; 10324c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 10339371c9d4SSatish Balay x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 10349371c9d4SSatish Balay row--; 10354c1414c8SBarry Smith break; 10364c1414c8SBarry Smith case 3: 10374c1414c8SBarry Smith sum1 = tmp[row]; 10384c1414c8SBarry Smith sum2 = tmp[row - 1]; 10394c1414c8SBarry Smith sum3 = tmp[row - 2]; 10404c1414c8SBarry Smith v2 = aa + ai[row] - 1; 10414c1414c8SBarry Smith v3 = aa + ai[row - 1] - 1; 10424c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 10434c1414c8SBarry Smith vi -= 2; 10444c1414c8SBarry Smith i0 = vi[2]; 10454c1414c8SBarry Smith i1 = vi[1]; 10464c1414c8SBarry Smith tmp0 = tmps[i0]; 10474c1414c8SBarry Smith tmp1 = tmps[i1]; 10484c1414c8SBarry Smith v1 -= 2; 10494c1414c8SBarry Smith v2 -= 2; 10504c1414c8SBarry Smith v3 -= 2; 10514c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 10524c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 10534c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 10544c1414c8SBarry Smith } 10554c1414c8SBarry Smith if (j == 1) { 10564c1414c8SBarry Smith tmp0 = tmps[*vi--]; 10574c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 10584c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 10594c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 10604c1414c8SBarry Smith } 10619371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 10629371c9d4SSatish Balay row--; 10634c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 10644c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 10659371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 10669371c9d4SSatish Balay row--; 10674c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 10689371c9d4SSatish Balay x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 10699371c9d4SSatish Balay row--; 10704c1414c8SBarry Smith 10714c1414c8SBarry Smith break; 
10724c1414c8SBarry Smith case 4: 10734c1414c8SBarry Smith sum1 = tmp[row]; 10744c1414c8SBarry Smith sum2 = tmp[row - 1]; 10754c1414c8SBarry Smith sum3 = tmp[row - 2]; 10764c1414c8SBarry Smith sum4 = tmp[row - 3]; 10774c1414c8SBarry Smith v2 = aa + ai[row] - 1; 10784c1414c8SBarry Smith v3 = aa + ai[row - 1] - 1; 10794c1414c8SBarry Smith v4 = aa + ai[row - 2] - 1; 10804c1414c8SBarry Smith 10814c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 10824c1414c8SBarry Smith vi -= 2; 10834c1414c8SBarry Smith i0 = vi[2]; 10844c1414c8SBarry Smith i1 = vi[1]; 10854c1414c8SBarry Smith tmp0 = tmps[i0]; 10864c1414c8SBarry Smith tmp1 = tmps[i1]; 10874c1414c8SBarry Smith v1 -= 2; 10884c1414c8SBarry Smith v2 -= 2; 10894c1414c8SBarry Smith v3 -= 2; 10904c1414c8SBarry Smith v4 -= 2; 10914c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 10924c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 10934c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 10944c1414c8SBarry Smith sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 10954c1414c8SBarry Smith } 10964c1414c8SBarry Smith if (j == 1) { 10974c1414c8SBarry Smith tmp0 = tmps[*vi--]; 10984c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 10994c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 11004c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11014c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11024c1414c8SBarry Smith } 11034c1414c8SBarry Smith 11049371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 11059371c9d4SSatish Balay row--; 11064c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 11074c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11084c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11099371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 11109371c9d4SSatish Balay row--; 11114c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11124c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11139371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 11149371c9d4SSatish Balay row--; 11154c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11169371c9d4SSatish 
Balay x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 11179371c9d4SSatish Balay row--; 11184c1414c8SBarry Smith break; 11194c1414c8SBarry Smith case 5: 11204c1414c8SBarry Smith sum1 = tmp[row]; 11214c1414c8SBarry Smith sum2 = tmp[row - 1]; 11224c1414c8SBarry Smith sum3 = tmp[row - 2]; 11234c1414c8SBarry Smith sum4 = tmp[row - 3]; 11244c1414c8SBarry Smith sum5 = tmp[row - 4]; 11254c1414c8SBarry Smith v2 = aa + ai[row] - 1; 11264c1414c8SBarry Smith v3 = aa + ai[row - 1] - 1; 11274c1414c8SBarry Smith v4 = aa + ai[row - 2] - 1; 11284c1414c8SBarry Smith v5 = aa + ai[row - 3] - 1; 11294c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 11304c1414c8SBarry Smith vi -= 2; 11314c1414c8SBarry Smith i0 = vi[2]; 11324c1414c8SBarry Smith i1 = vi[1]; 11334c1414c8SBarry Smith tmp0 = tmps[i0]; 11344c1414c8SBarry Smith tmp1 = tmps[i1]; 11354c1414c8SBarry Smith v1 -= 2; 11364c1414c8SBarry Smith v2 -= 2; 11374c1414c8SBarry Smith v3 -= 2; 11384c1414c8SBarry Smith v4 -= 2; 11394c1414c8SBarry Smith v5 -= 2; 11404c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 11414c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 11424c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 11434c1414c8SBarry Smith sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 11444c1414c8SBarry Smith sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 11454c1414c8SBarry Smith } 11464c1414c8SBarry Smith if (j == 1) { 11474c1414c8SBarry Smith tmp0 = tmps[*vi--]; 11484c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 11494c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 11504c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11514c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11524c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11534c1414c8SBarry Smith } 11544c1414c8SBarry Smith 11559371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 11569371c9d4SSatish Balay row--; 11574c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 11584c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11594c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11604c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 
11619371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 11629371c9d4SSatish Balay row--; 11634c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11644c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11654c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11669371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 11679371c9d4SSatish Balay row--; 11684c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11694c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11709371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 11719371c9d4SSatish Balay row--; 11724c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11739371c9d4SSatish Balay x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 11749371c9d4SSatish Balay row--; 11754c1414c8SBarry Smith break; 1176d71ae5a4SJacob Faibussowitsch default: 1177d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 11784c1414c8SBarry Smith } 11794c1414c8SBarry Smith } 11809566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 11819566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 11829566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 11839566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(xx, &x)); 11849566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 11853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 11864c1414c8SBarry Smith } 11874c1414c8SBarry Smith 1188d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1189d71ae5a4SJacob Faibussowitsch { 119028f1b45aSHong Zhang Mat C = B; 119128f1b45aSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 119228f1b45aSHong Zhang IS isrow = b->row, isicol = b->icol; 119328f1b45aSHong Zhang const PetscInt *r, *ic, *ics; 119428f1b45aSHong Zhang const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 119528f1b45aSHong Zhang 
PetscInt i, j, k, nz, nzL, row, *pj; 119628f1b45aSHong Zhang const PetscInt *ajtmp, *bjtmp; 11979877982aSShri Abhyankar MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 11989877982aSShri Abhyankar const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 119928f1b45aSHong Zhang FactorShiftCtx sctx; 12004f81c4b7SBarry Smith const PetscInt *ddiag; 120128f1b45aSHong Zhang PetscReal rs; 120228f1b45aSHong Zhang MatScalar d; 12034f81c4b7SBarry Smith PetscInt inod, nodesz, node_max, col; 12044f81c4b7SBarry Smith const PetscInt *ns; 120507b50cabSHong Zhang PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 12060e95ead3SHong Zhang 120728f1b45aSHong Zhang PetscFunctionBegin; 120828f1b45aSHong Zhang /* MatPivotSetUp(): initialize shift context sctx */ 12099566063dSJacob Faibussowitsch PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 121028f1b45aSHong Zhang 1211f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 121228f1b45aSHong Zhang ddiag = a->diag; 121328f1b45aSHong Zhang sctx.shift_top = info->zeropivot; 121428f1b45aSHong Zhang for (i = 0; i < n; i++) { 121528f1b45aSHong Zhang /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 121628f1b45aSHong Zhang d = (aa)[ddiag[i]]; 121728f1b45aSHong Zhang rs = -PetscAbsScalar(d) - PetscRealPart(d); 121828f1b45aSHong Zhang v = aa + ai[i]; 121928f1b45aSHong Zhang nz = ai[i + 1] - ai[i]; 12202205254eSKarl Rupp for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 122128f1b45aSHong Zhang if (rs > sctx.shift_top) sctx.shift_top = rs; 122228f1b45aSHong Zhang } 122328f1b45aSHong Zhang sctx.shift_top *= 1.1; 122428f1b45aSHong Zhang sctx.nshift_max = 5; 122528f1b45aSHong Zhang sctx.shift_lo = 0.; 122628f1b45aSHong Zhang sctx.shift_hi = 1.; 122728f1b45aSHong Zhang } 122828f1b45aSHong Zhang 12299566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 12309566063dSJacob Faibussowitsch 
PetscCall(ISGetIndices(isicol, &ic)); 123168785679SHong Zhang 12329566063dSJacob Faibussowitsch PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 123328f1b45aSHong Zhang ics = ic; 123428f1b45aSHong Zhang 123528f1b45aSHong Zhang node_max = a->inode.node_count; 123628f1b45aSHong Zhang ns = a->inode.size; 123728b400f6SJacob Faibussowitsch PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 123828f1b45aSHong Zhang 12399877982aSShri Abhyankar /* If max inode size > 4, split it into two inodes.*/ 124068785679SHong Zhang /* also map the inode sizes according to the ordering */ 12419566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 124268785679SHong Zhang for (i = 0, j = 0; i < node_max; ++i, ++j) { 1243b1550197SShri Abhyankar if (ns[i] > 4) { 1244048b5e81SShri Abhyankar tmp_vec1[j] = 4; 124568785679SHong Zhang ++j; 124668785679SHong Zhang tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 124768785679SHong Zhang } else { 124868785679SHong Zhang tmp_vec1[j] = ns[i]; 124968785679SHong Zhang } 125068785679SHong Zhang } 125168785679SHong Zhang /* Use the correct node_max */ 125268785679SHong Zhang node_max = j; 125368785679SHong Zhang 125468785679SHong Zhang /* Now reorder the inode info based on mat re-ordering info */ 125568785679SHong Zhang /* First create a row -> inode_size_array_index map */ 12569566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &nsmap)); 12579566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 125868785679SHong Zhang for (i = 0, row = 0; i < node_max; i++) { 125968785679SHong Zhang nodesz = tmp_vec1[i]; 1260ad540459SPierre Jolivet for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 126168785679SHong Zhang } 126268785679SHong Zhang /* Using nsmap, create a reordered ns structure */ 126368785679SHong Zhang for (i = 0, j = 0; i < node_max; i++) { 126468785679SHong Zhang nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 
126568785679SHong Zhang tmp_vec2[i] = nodesz; 126668785679SHong Zhang j += nodesz; 126768785679SHong Zhang } 12689566063dSJacob Faibussowitsch PetscCall(PetscFree(nsmap)); 12699566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp_vec1)); 1270b89f182dSHong Zhang 127168785679SHong Zhang /* Now use the correct ns */ 127268785679SHong Zhang ns = tmp_vec2; 127368785679SHong Zhang 127428f1b45aSHong Zhang do { 127507b50cabSHong Zhang sctx.newshift = PETSC_FALSE; 127628f1b45aSHong Zhang /* Now loop over each block-row, and do the factorization */ 127728f1b45aSHong Zhang for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 127828f1b45aSHong Zhang nodesz = ns[inod]; 127928f1b45aSHong Zhang 128028f1b45aSHong Zhang switch (nodesz) { 128128f1b45aSHong Zhang case 1: 1282b89f182dSHong Zhang /* zero rtmp1 */ 128328f1b45aSHong Zhang /* L part */ 128428f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 128528f1b45aSHong Zhang bjtmp = bj + bi[i]; 1286b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 128728f1b45aSHong Zhang 128828f1b45aSHong Zhang /* U part */ 128928f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1]; 129028f1b45aSHong Zhang bjtmp = bj + bdiag[i + 1] + 1; 1291b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 129228f1b45aSHong Zhang 129328f1b45aSHong Zhang /* load in initial (unfactored row) */ 129428f1b45aSHong Zhang nz = ai[r[i] + 1] - ai[r[i]]; 129528f1b45aSHong Zhang ajtmp = aj + ai[r[i]]; 129628f1b45aSHong Zhang v = aa + ai[r[i]]; 12972205254eSKarl Rupp for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 12982205254eSKarl Rupp 129928f1b45aSHong Zhang /* ZeropivotApply() */ 1300b89f182dSHong Zhang rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 130128f1b45aSHong Zhang 130228f1b45aSHong Zhang /* elimination */ 130328f1b45aSHong Zhang bjtmp = bj + bi[i]; 130428f1b45aSHong Zhang row = *bjtmp++; 130528f1b45aSHong Zhang nzL = bi[i + 1] - bi[i]; 130628f1b45aSHong Zhang for (k = 0; k < nzL; 
k++) { 1307b89f182dSHong Zhang pc = rtmp1 + row; 130828f1b45aSHong Zhang if (*pc != 0.0) { 130928f1b45aSHong Zhang pv = b->a + bdiag[row]; 1310b89f182dSHong Zhang mul1 = *pc * (*pv); 1311b89f182dSHong Zhang *pc = mul1; 131228f1b45aSHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 131328f1b45aSHong Zhang pv = b->a + bdiag[row + 1] + 1; 131428f1b45aSHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1315b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 13169566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 131728f1b45aSHong Zhang } 131828f1b45aSHong Zhang row = *bjtmp++; 131928f1b45aSHong Zhang } 132028f1b45aSHong Zhang 132128f1b45aSHong Zhang /* finished row so stick it into b->a */ 132228f1b45aSHong Zhang rs = 0.0; 132328f1b45aSHong Zhang /* L part */ 132428f1b45aSHong Zhang pv = b->a + bi[i]; 132528f1b45aSHong Zhang pj = b->j + bi[i]; 132628f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 132728f1b45aSHong Zhang for (j = 0; j < nz; j++) { 13289371c9d4SSatish Balay pv[j] = rtmp1[pj[j]]; 13299371c9d4SSatish Balay rs += PetscAbsScalar(pv[j]); 133028f1b45aSHong Zhang } 133128f1b45aSHong Zhang 133228f1b45aSHong Zhang /* U part */ 133328f1b45aSHong Zhang pv = b->a + bdiag[i + 1] + 1; 133428f1b45aSHong Zhang pj = b->j + bdiag[i + 1] + 1; 133528f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; 133628f1b45aSHong Zhang for (j = 0; j < nz; j++) { 13379371c9d4SSatish Balay pv[j] = rtmp1[pj[j]]; 13389371c9d4SSatish Balay rs += PetscAbsScalar(pv[j]); 133928f1b45aSHong Zhang } 134028f1b45aSHong Zhang 1341b89f182dSHong Zhang /* Check zero pivot */ 134228f1b45aSHong Zhang sctx.rs = rs; 1343b89f182dSHong Zhang sctx.pv = rtmp1[i]; 13449566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 134507b50cabSHong Zhang if (sctx.newshift) break; 134628f1b45aSHong Zhang 1347a5b23f4aSJose E. 
Roman /* Mark diagonal and invert diagonal for simpler triangular solves */ 134828f1b45aSHong Zhang pv = b->a + bdiag[i]; 1349b89f182dSHong Zhang *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 135028f1b45aSHong Zhang break; 135128f1b45aSHong Zhang 135228f1b45aSHong Zhang case 2: 1353b89f182dSHong Zhang /* zero rtmp1 and rtmp2 */ 135428f1b45aSHong Zhang /* L part */ 135528f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 135628f1b45aSHong Zhang bjtmp = bj + bi[i]; 135728f1b45aSHong Zhang for (j = 0; j < nz; j++) { 135868785679SHong Zhang col = bjtmp[j]; 13599371c9d4SSatish Balay rtmp1[col] = 0.0; 13609371c9d4SSatish Balay rtmp2[col] = 0.0; 136128f1b45aSHong Zhang } 136228f1b45aSHong Zhang 136328f1b45aSHong Zhang /* U part */ 136428f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1]; 136528f1b45aSHong Zhang bjtmp = bj + bdiag[i + 1] + 1; 136628f1b45aSHong Zhang for (j = 0; j < nz; j++) { 136768785679SHong Zhang col = bjtmp[j]; 13689371c9d4SSatish Balay rtmp1[col] = 0.0; 13699371c9d4SSatish Balay rtmp2[col] = 0.0; 137028f1b45aSHong Zhang } 137128f1b45aSHong Zhang 137228f1b45aSHong Zhang /* load in initial (unfactored row) */ 137328f1b45aSHong Zhang nz = ai[r[i] + 1] - ai[r[i]]; 137428f1b45aSHong Zhang ajtmp = aj + ai[r[i]]; 13759371c9d4SSatish Balay v1 = aa + ai[r[i]]; 13769371c9d4SSatish Balay v2 = aa + ai[r[i] + 1]; 137728f1b45aSHong Zhang for (j = 0; j < nz; j++) { 137868785679SHong Zhang col = ics[ajtmp[j]]; 13799371c9d4SSatish Balay rtmp1[col] = v1[j]; 13809371c9d4SSatish Balay rtmp2[col] = v2[j]; 138128f1b45aSHong Zhang } 138228f1b45aSHong Zhang /* ZeropivotApply(): shift the diagonal of the matrix */ 13839371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount; 13849371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount; 138528f1b45aSHong Zhang 138628f1b45aSHong Zhang /* elimination */ 138728f1b45aSHong Zhang bjtmp = bj + bi[i]; 138828f1b45aSHong Zhang row = *bjtmp++; /* pivot row */ 138928f1b45aSHong Zhang nzL = bi[i + 1] - bi[i]; 
139028f1b45aSHong Zhang for (k = 0; k < nzL; k++) { 1391b89f182dSHong Zhang pc1 = rtmp1 + row; 1392b89f182dSHong Zhang pc2 = rtmp2 + row; 139328f1b45aSHong Zhang if (*pc1 != 0.0 || *pc2 != 0.0) { 139428f1b45aSHong Zhang pv = b->a + bdiag[row]; 13959371c9d4SSatish Balay mul1 = *pc1 * (*pv); 13969371c9d4SSatish Balay mul2 = *pc2 * (*pv); 13979371c9d4SSatish Balay *pc1 = mul1; 13989371c9d4SSatish Balay *pc2 = mul2; 139928f1b45aSHong Zhang 140028f1b45aSHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 140128f1b45aSHong Zhang pv = b->a + bdiag[row + 1] + 1; 140228f1b45aSHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 140328f1b45aSHong Zhang for (j = 0; j < nz; j++) { 140468785679SHong Zhang col = pj[j]; 1405b89f182dSHong Zhang rtmp1[col] -= mul1 * pv[j]; 1406b89f182dSHong Zhang rtmp2[col] -= mul2 * pv[j]; 140728f1b45aSHong Zhang } 14089566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz)); 140928f1b45aSHong Zhang } 141028f1b45aSHong Zhang row = *bjtmp++; 141128f1b45aSHong Zhang } 141228f1b45aSHong Zhang 1413b89f182dSHong Zhang /* finished row i; check zero pivot, then stick row i into b->a */ 141428f1b45aSHong Zhang rs = 0.0; 141528f1b45aSHong Zhang /* L part */ 1416b89f182dSHong Zhang pc1 = b->a + bi[i]; 141728f1b45aSHong Zhang pj = b->j + bi[i]; 141828f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 141928f1b45aSHong Zhang for (j = 0; j < nz; j++) { 142068785679SHong Zhang col = pj[j]; 14219371c9d4SSatish Balay pc1[j] = rtmp1[col]; 14229371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 142328f1b45aSHong Zhang } 142428f1b45aSHong Zhang /* U part */ 1425b89f182dSHong Zhang pc1 = b->a + bdiag[i + 1] + 1; 142628f1b45aSHong Zhang pj = b->j + bdiag[i + 1] + 1; 14270e7a5c2bSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 142828f1b45aSHong Zhang for (j = 0; j < nz; j++) { 142968785679SHong Zhang col = pj[j]; 14309371c9d4SSatish Balay pc1[j] = rtmp1[col]; 14319371c9d4SSatish Balay 
rs += PetscAbsScalar(pc1[j]); 143228f1b45aSHong Zhang } 143328f1b45aSHong Zhang 143428f1b45aSHong Zhang sctx.rs = rs; 1435b89f182dSHong Zhang sctx.pv = rtmp1[i]; 14369566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 143707b50cabSHong Zhang if (sctx.newshift) break; 1438b89f182dSHong Zhang pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1439b89f182dSHong Zhang *pc1 = 1.0 / sctx.pv; 1440b89f182dSHong Zhang 1441b89f182dSHong Zhang /* Now take care of diagonal 2x2 block. */ 1442b89f182dSHong Zhang pc2 = rtmp2 + i; 1443b89f182dSHong Zhang if (*pc2 != 0.0) { 1444b89f182dSHong Zhang mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1445b89f182dSHong Zhang *pc2 = mul1; /* insert L entry */ 1446b89f182dSHong Zhang pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1447b89f182dSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1448b89f182dSHong Zhang for (j = 0; j < nz; j++) { 14499371c9d4SSatish Balay col = pj[j]; 14509371c9d4SSatish Balay rtmp2[col] -= mul1 * rtmp1[col]; 145128f1b45aSHong Zhang } 14529566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1453b89f182dSHong Zhang } 1454b89f182dSHong Zhang 1455b89f182dSHong Zhang /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1456b89f182dSHong Zhang rs = 0.0; 1457b89f182dSHong Zhang /* L part */ 1458b89f182dSHong Zhang pc2 = b->a + bi[i + 1]; 1459b89f182dSHong Zhang pj = b->j + bi[i + 1]; 1460b89f182dSHong Zhang nz = bi[i + 2] - bi[i + 1]; 1461b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1462b89f182dSHong Zhang col = pj[j]; 14639371c9d4SSatish Balay pc2[j] = rtmp2[col]; 14649371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1465b89f182dSHong Zhang } 1466b89f182dSHong Zhang /* U part */ 1467b89f182dSHong Zhang pc2 = b->a + bdiag[i + 2] + 1; 14680e7a5c2bSHong Zhang pj = b->j + bdiag[i + 2] + 1; 14690e7a5c2bSHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1470b89f182dSHong Zhang for (j = 0; 
j < nz; j++) { 1471b89f182dSHong Zhang col = pj[j]; 14729371c9d4SSatish Balay pc2[j] = rtmp2[col]; 14739371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1474b89f182dSHong Zhang } 1475b89f182dSHong Zhang 147628f1b45aSHong Zhang sctx.rs = rs; 1477b89f182dSHong Zhang sctx.pv = rtmp2[i + 1]; 14789566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 147907b50cabSHong Zhang if (sctx.newshift) break; 148028f1b45aSHong Zhang pc2 = b->a + bdiag[i + 1]; 1481b89f182dSHong Zhang *pc2 = 1.0 / sctx.pv; 148228f1b45aSHong Zhang break; 1483b89f182dSHong Zhang 148468785679SHong Zhang case 3: 148568785679SHong Zhang /* zero rtmp */ 148668785679SHong Zhang /* L part */ 148768785679SHong Zhang nz = bi[i + 1] - bi[i]; 148868785679SHong Zhang bjtmp = bj + bi[i]; 148968785679SHong Zhang for (j = 0; j < nz; j++) { 149068785679SHong Zhang col = bjtmp[j]; 14919371c9d4SSatish Balay rtmp1[col] = 0.0; 14929371c9d4SSatish Balay rtmp2[col] = 0.0; 14939371c9d4SSatish Balay rtmp3[col] = 0.0; 149468785679SHong Zhang } 149568785679SHong Zhang 149668785679SHong Zhang /* U part */ 149768785679SHong Zhang nz = bdiag[i] - bdiag[i + 1]; 149868785679SHong Zhang bjtmp = bj + bdiag[i + 1] + 1; 149968785679SHong Zhang for (j = 0; j < nz; j++) { 150068785679SHong Zhang col = bjtmp[j]; 15019371c9d4SSatish Balay rtmp1[col] = 0.0; 15029371c9d4SSatish Balay rtmp2[col] = 0.0; 15039371c9d4SSatish Balay rtmp3[col] = 0.0; 150468785679SHong Zhang } 150568785679SHong Zhang 150668785679SHong Zhang /* load in initial (unfactored row) */ 150768785679SHong Zhang nz = ai[r[i] + 1] - ai[r[i]]; 150868785679SHong Zhang ajtmp = aj + ai[r[i]]; 15099371c9d4SSatish Balay v1 = aa + ai[r[i]]; 15109371c9d4SSatish Balay v2 = aa + ai[r[i] + 1]; 15119371c9d4SSatish Balay v3 = aa + ai[r[i] + 2]; 151268785679SHong Zhang for (j = 0; j < nz; j++) { 151368785679SHong Zhang col = ics[ajtmp[j]]; 15149371c9d4SSatish Balay rtmp1[col] = v1[j]; 15159371c9d4SSatish Balay rtmp2[col] = v2[j]; 15169371c9d4SSatish Balay 
rtmp3[col] = v3[j]; 151768785679SHong Zhang } 151868785679SHong Zhang /* ZeropivotApply(): shift the diagonal of the matrix */ 15199371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount; 15209371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount; 15219371c9d4SSatish Balay rtmp3[i + 2] += sctx.shift_amount; 152268785679SHong Zhang 152368785679SHong Zhang /* elimination */ 152468785679SHong Zhang bjtmp = bj + bi[i]; 152568785679SHong Zhang row = *bjtmp++; /* pivot row */ 152668785679SHong Zhang nzL = bi[i + 1] - bi[i]; 152768785679SHong Zhang for (k = 0; k < nzL; k++) { 1528b89f182dSHong Zhang pc1 = rtmp1 + row; 1529b89f182dSHong Zhang pc2 = rtmp2 + row; 1530b89f182dSHong Zhang pc3 = rtmp3 + row; 153168785679SHong Zhang if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 153268785679SHong Zhang pv = b->a + bdiag[row]; 15339371c9d4SSatish Balay mul1 = *pc1 * (*pv); 15349371c9d4SSatish Balay mul2 = *pc2 * (*pv); 15359371c9d4SSatish Balay mul3 = *pc3 * (*pv); 15369371c9d4SSatish Balay *pc1 = mul1; 15379371c9d4SSatish Balay *pc2 = mul2; 15389371c9d4SSatish Balay *pc3 = mul3; 153968785679SHong Zhang 154068785679SHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 154168785679SHong Zhang pv = b->a + bdiag[row + 1] + 1; 154268785679SHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 154368785679SHong Zhang for (j = 0; j < nz; j++) { 154468785679SHong Zhang col = pj[j]; 1545b89f182dSHong Zhang rtmp1[col] -= mul1 * pv[j]; 1546b89f182dSHong Zhang rtmp2[col] -= mul2 * pv[j]; 1547b89f182dSHong Zhang rtmp3[col] -= mul3 * pv[j]; 154868785679SHong Zhang } 15499566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(3 + 6.0 * nz)); 155068785679SHong Zhang } 155168785679SHong Zhang row = *bjtmp++; 155268785679SHong Zhang } 155368785679SHong Zhang 1554b89f182dSHong Zhang /* finished row i; check zero pivot, then stick row i into b->a */ 1555b89f182dSHong Zhang rs = 0.0; 1556b89f182dSHong Zhang /* L part */ 1557b89f182dSHong 
Zhang pc1 = b->a + bi[i]; 1558b89f182dSHong Zhang pj = b->j + bi[i]; 1559b89f182dSHong Zhang nz = bi[i + 1] - bi[i]; 1560b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1561b89f182dSHong Zhang col = pj[j]; 15629371c9d4SSatish Balay pc1[j] = rtmp1[col]; 15639371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 1564b89f182dSHong Zhang } 1565b89f182dSHong Zhang /* U part */ 1566b89f182dSHong Zhang pc1 = b->a + bdiag[i + 1] + 1; 1567b89f182dSHong Zhang pj = b->j + bdiag[i + 1] + 1; 15680e7a5c2bSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1569b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1570b89f182dSHong Zhang col = pj[j]; 15719371c9d4SSatish Balay pc1[j] = rtmp1[col]; 15729371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 1573b89f182dSHong Zhang } 157468785679SHong Zhang 1575b89f182dSHong Zhang sctx.rs = rs; 1576b89f182dSHong Zhang sctx.pv = rtmp1[i]; 15779566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 157807b50cabSHong Zhang if (sctx.newshift) break; 1579b89f182dSHong Zhang pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1580b89f182dSHong Zhang *pc1 = 1.0 / sctx.pv; 1581b89f182dSHong Zhang 1582b89f182dSHong Zhang /* Now take care of 1st column of diagonal 3x3 block. 
*/ 1583b89f182dSHong Zhang pc2 = rtmp2 + i; 1584b89f182dSHong Zhang pc3 = rtmp3 + i; 1585b89f182dSHong Zhang if (*pc2 != 0.0 || *pc3 != 0.0) { 15869371c9d4SSatish Balay mul2 = (*pc2) * (*pc1); 15879371c9d4SSatish Balay *pc2 = mul2; 15889371c9d4SSatish Balay mul3 = (*pc3) * (*pc1); 15899371c9d4SSatish Balay *pc3 = mul3; 159068785679SHong Zhang pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 159168785679SHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 159268785679SHong Zhang for (j = 0; j < nz; j++) { 159368785679SHong Zhang col = pj[j]; 1594b89f182dSHong Zhang rtmp2[col] -= mul2 * rtmp1[col]; 1595b89f182dSHong Zhang rtmp3[col] -= mul3 * rtmp1[col]; 159668785679SHong Zhang } 15979566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz)); 159868785679SHong Zhang } 159968785679SHong Zhang 1600b89f182dSHong Zhang /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1601b89f182dSHong Zhang rs = 0.0; 1602b89f182dSHong Zhang /* L part */ 1603b89f182dSHong Zhang pc2 = b->a + bi[i + 1]; 1604b89f182dSHong Zhang pj = b->j + bi[i + 1]; 1605b89f182dSHong Zhang nz = bi[i + 2] - bi[i + 1]; 1606b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1607b89f182dSHong Zhang col = pj[j]; 16089371c9d4SSatish Balay pc2[j] = rtmp2[col]; 16099371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1610b89f182dSHong Zhang } 1611b89f182dSHong Zhang /* U part */ 1612b89f182dSHong Zhang pc2 = b->a + bdiag[i + 2] + 1; 16130e7a5c2bSHong Zhang pj = b->j + bdiag[i + 2] + 1; 16140e7a5c2bSHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1615b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1616b89f182dSHong Zhang col = pj[j]; 16179371c9d4SSatish Balay pc2[j] = rtmp2[col]; 16189371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1619b89f182dSHong Zhang } 1620b89f182dSHong Zhang 1621b89f182dSHong Zhang sctx.rs = rs; 1622b89f182dSHong Zhang sctx.pv = rtmp2[i + 1]; 16239566063dSJacob Faibussowitsch 
PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 162407b50cabSHong Zhang if (sctx.newshift) break; 1625b89f182dSHong Zhang pc2 = b->a + bdiag[i + 1]; 1626b89f182dSHong Zhang *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1627b89f182dSHong Zhang 1628b89f182dSHong Zhang /* Now take care of 2nd column of diagonal 3x3 block. */ 1629b89f182dSHong Zhang pc3 = rtmp3 + i + 1; 163068785679SHong Zhang if (*pc3 != 0.0) { 16319371c9d4SSatish Balay mul3 = (*pc3) * (*pc2); 16329371c9d4SSatish Balay *pc3 = mul3; 163368785679SHong Zhang pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 163468785679SHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 163568785679SHong Zhang for (j = 0; j < nz; j++) { 163668785679SHong Zhang col = pj[j]; 1637b89f182dSHong Zhang rtmp3[col] -= mul3 * rtmp2[col]; 163868785679SHong Zhang } 16399566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 164068785679SHong Zhang } 164168785679SHong Zhang 1642b89f182dSHong Zhang /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 164368785679SHong Zhang rs = 0.0; 164468785679SHong Zhang /* L part */ 1645b89f182dSHong Zhang pc3 = b->a + bi[i + 2]; 1646b89f182dSHong Zhang pj = b->j + bi[i + 2]; 1647b89f182dSHong Zhang nz = bi[i + 3] - bi[i + 2]; 164868785679SHong Zhang for (j = 0; j < nz; j++) { 164968785679SHong Zhang col = pj[j]; 16509371c9d4SSatish Balay pc3[j] = rtmp3[col]; 16519371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 165268785679SHong Zhang } 165368785679SHong Zhang /* U part */ 1654b89f182dSHong Zhang pc3 = b->a + bdiag[i + 3] + 1; 16550e7a5c2bSHong Zhang pj = b->j + bdiag[i + 3] + 1; 16560e7a5c2bSHong Zhang nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 165768785679SHong Zhang for (j = 0; j < nz; j++) { 165868785679SHong Zhang col = pj[j]; 16599371c9d4SSatish Balay pc3[j] = rtmp3[col]; 16609371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 166168785679SHong Zhang } 166268785679SHong Zhang 
166368785679SHong Zhang sctx.rs = rs; 1664b89f182dSHong Zhang sctx.pv = rtmp3[i + 2]; 16659566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 166607b50cabSHong Zhang if (sctx.newshift) break; 166768785679SHong Zhang pc3 = b->a + bdiag[i + 2]; 1668b89f182dSHong Zhang *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 166968785679SHong Zhang break; 16709877982aSShri Abhyankar case 4: 16719877982aSShri Abhyankar /* zero rtmp */ 16729877982aSShri Abhyankar /* L part */ 16739877982aSShri Abhyankar nz = bi[i + 1] - bi[i]; 16749877982aSShri Abhyankar bjtmp = bj + bi[i]; 16759877982aSShri Abhyankar for (j = 0; j < nz; j++) { 16769877982aSShri Abhyankar col = bjtmp[j]; 16779371c9d4SSatish Balay rtmp1[col] = 0.0; 16789371c9d4SSatish Balay rtmp2[col] = 0.0; 16799371c9d4SSatish Balay rtmp3[col] = 0.0; 16809371c9d4SSatish Balay rtmp4[col] = 0.0; 16819877982aSShri Abhyankar } 16829877982aSShri Abhyankar 16839877982aSShri Abhyankar /* U part */ 16849877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1]; 16859877982aSShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1; 16869877982aSShri Abhyankar for (j = 0; j < nz; j++) { 16879877982aSShri Abhyankar col = bjtmp[j]; 16889371c9d4SSatish Balay rtmp1[col] = 0.0; 16899371c9d4SSatish Balay rtmp2[col] = 0.0; 16909371c9d4SSatish Balay rtmp3[col] = 0.0; 16919371c9d4SSatish Balay rtmp4[col] = 0.0; 16929877982aSShri Abhyankar } 16939877982aSShri Abhyankar 16949877982aSShri Abhyankar /* load in initial (unfactored row) */ 16959877982aSShri Abhyankar nz = ai[r[i] + 1] - ai[r[i]]; 16969877982aSShri Abhyankar ajtmp = aj + ai[r[i]]; 16979371c9d4SSatish Balay v1 = aa + ai[r[i]]; 16989371c9d4SSatish Balay v2 = aa + ai[r[i] + 1]; 16999371c9d4SSatish Balay v3 = aa + ai[r[i] + 2]; 17009371c9d4SSatish Balay v4 = aa + ai[r[i] + 3]; 17019877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17029877982aSShri Abhyankar col = ics[ajtmp[j]]; 17039371c9d4SSatish Balay rtmp1[col] = v1[j]; 17049371c9d4SSatish Balay rtmp2[col] = v2[j]; 
17059371c9d4SSatish Balay rtmp3[col] = v3[j]; 17069371c9d4SSatish Balay rtmp4[col] = v4[j]; 17079877982aSShri Abhyankar } 17089877982aSShri Abhyankar /* ZeropivotApply(): shift the diagonal of the matrix */ 17099371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount; 17109371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount; 17119371c9d4SSatish Balay rtmp3[i + 2] += sctx.shift_amount; 17129371c9d4SSatish Balay rtmp4[i + 3] += sctx.shift_amount; 17139877982aSShri Abhyankar 17149877982aSShri Abhyankar /* elimination */ 17159877982aSShri Abhyankar bjtmp = bj + bi[i]; 17169877982aSShri Abhyankar row = *bjtmp++; /* pivot row */ 17179877982aSShri Abhyankar nzL = bi[i + 1] - bi[i]; 17189877982aSShri Abhyankar for (k = 0; k < nzL; k++) { 17199877982aSShri Abhyankar pc1 = rtmp1 + row; 17209877982aSShri Abhyankar pc2 = rtmp2 + row; 17219877982aSShri Abhyankar pc3 = rtmp3 + row; 17229877982aSShri Abhyankar pc4 = rtmp4 + row; 17239877982aSShri Abhyankar if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 17249877982aSShri Abhyankar pv = b->a + bdiag[row]; 17259371c9d4SSatish Balay mul1 = *pc1 * (*pv); 17269371c9d4SSatish Balay mul2 = *pc2 * (*pv); 17279371c9d4SSatish Balay mul3 = *pc3 * (*pv); 17289371c9d4SSatish Balay mul4 = *pc4 * (*pv); 17299371c9d4SSatish Balay *pc1 = mul1; 17309371c9d4SSatish Balay *pc2 = mul2; 17319371c9d4SSatish Balay *pc3 = mul3; 17329371c9d4SSatish Balay *pc4 = mul4; 17339877982aSShri Abhyankar 17349877982aSShri Abhyankar pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 17359877982aSShri Abhyankar pv = b->a + bdiag[row + 1] + 1; 17369877982aSShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 17379877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17389877982aSShri Abhyankar col = pj[j]; 17399877982aSShri Abhyankar rtmp1[col] -= mul1 * pv[j]; 17409877982aSShri Abhyankar rtmp2[col] -= mul2 * pv[j]; 17419877982aSShri Abhyankar rtmp3[col] -= mul3 * pv[j]; 17429877982aSShri Abhyankar 
rtmp4[col] -= mul4 * pv[j]; 17439877982aSShri Abhyankar } 17449566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4 + 8.0 * nz)); 17459877982aSShri Abhyankar } 17469877982aSShri Abhyankar row = *bjtmp++; 17479877982aSShri Abhyankar } 17489877982aSShri Abhyankar 17499877982aSShri Abhyankar /* finished row i; check zero pivot, then stick row i into b->a */ 17509877982aSShri Abhyankar rs = 0.0; 17519877982aSShri Abhyankar /* L part */ 17529877982aSShri Abhyankar pc1 = b->a + bi[i]; 17539877982aSShri Abhyankar pj = b->j + bi[i]; 17549877982aSShri Abhyankar nz = bi[i + 1] - bi[i]; 17559877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17569877982aSShri Abhyankar col = pj[j]; 17579371c9d4SSatish Balay pc1[j] = rtmp1[col]; 17589371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 17599877982aSShri Abhyankar } 17609877982aSShri Abhyankar /* U part */ 17619877982aSShri Abhyankar pc1 = b->a + bdiag[i + 1] + 1; 17629877982aSShri Abhyankar pj = b->j + bdiag[i + 1] + 1; 17639877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 17649877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17659877982aSShri Abhyankar col = pj[j]; 17669371c9d4SSatish Balay pc1[j] = rtmp1[col]; 17679371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 17689877982aSShri Abhyankar } 17699877982aSShri Abhyankar 17709877982aSShri Abhyankar sctx.rs = rs; 17719877982aSShri Abhyankar sctx.pv = rtmp1[i]; 17729566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 177307b50cabSHong Zhang if (sctx.newshift) break; 17749877982aSShri Abhyankar pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 17759877982aSShri Abhyankar *pc1 = 1.0 / sctx.pv; 17769877982aSShri Abhyankar 17779877982aSShri Abhyankar /* Now take care of 1st column of diagonal 4x4 block. 
*/ 17789877982aSShri Abhyankar pc2 = rtmp2 + i; 17799877982aSShri Abhyankar pc3 = rtmp3 + i; 17809877982aSShri Abhyankar pc4 = rtmp4 + i; 17819877982aSShri Abhyankar if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 17829371c9d4SSatish Balay mul2 = (*pc2) * (*pc1); 17839371c9d4SSatish Balay *pc2 = mul2; 17849371c9d4SSatish Balay mul3 = (*pc3) * (*pc1); 17859371c9d4SSatish Balay *pc3 = mul3; 17869371c9d4SSatish Balay mul4 = (*pc4) * (*pc1); 17879371c9d4SSatish Balay *pc4 = mul4; 17889877982aSShri Abhyankar pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 17899877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 17909877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17919877982aSShri Abhyankar col = pj[j]; 17929877982aSShri Abhyankar rtmp2[col] -= mul2 * rtmp1[col]; 17939877982aSShri Abhyankar rtmp3[col] -= mul3 * rtmp1[col]; 17949877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp1[col]; 17959877982aSShri Abhyankar } 17969566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(3 + 6.0 * nz)); 17979877982aSShri Abhyankar } 17989877982aSShri Abhyankar 17999877982aSShri Abhyankar /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 18009877982aSShri Abhyankar rs = 0.0; 18019877982aSShri Abhyankar /* L part */ 18029877982aSShri Abhyankar pc2 = b->a + bi[i + 1]; 18039877982aSShri Abhyankar pj = b->j + bi[i + 1]; 18049877982aSShri Abhyankar nz = bi[i + 2] - bi[i + 1]; 18059877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18069877982aSShri Abhyankar col = pj[j]; 18079371c9d4SSatish Balay pc2[j] = rtmp2[col]; 18089371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 18099877982aSShri Abhyankar } 18109877982aSShri Abhyankar /* U part */ 18119877982aSShri Abhyankar pc2 = b->a + bdiag[i + 2] + 1; 18129877982aSShri Abhyankar pj = b->j + bdiag[i + 2] + 1; 18139877982aSShri Abhyankar nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 18149877982aSShri Abhyankar for (j = 0; j < nz; j++) { 
18159877982aSShri Abhyankar col = pj[j]; 18169371c9d4SSatish Balay pc2[j] = rtmp2[col]; 18179371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 18189877982aSShri Abhyankar } 18199877982aSShri Abhyankar 18209877982aSShri Abhyankar sctx.rs = rs; 18219877982aSShri Abhyankar sctx.pv = rtmp2[i + 1]; 18229566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 182307b50cabSHong Zhang if (sctx.newshift) break; 18249877982aSShri Abhyankar pc2 = b->a + bdiag[i + 1]; 18259877982aSShri Abhyankar *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 18269877982aSShri Abhyankar 18279877982aSShri Abhyankar /* Now take care of 2nd column of diagonal 4x4 block. */ 18289877982aSShri Abhyankar pc3 = rtmp3 + i + 1; 18299877982aSShri Abhyankar pc4 = rtmp4 + i + 1; 18309877982aSShri Abhyankar if (*pc3 != 0.0 || *pc4 != 0.0) { 18319371c9d4SSatish Balay mul3 = (*pc3) * (*pc2); 18329371c9d4SSatish Balay *pc3 = mul3; 18339371c9d4SSatish Balay mul4 = (*pc4) * (*pc2); 18349371c9d4SSatish Balay *pc4 = mul4; 18359877982aSShri Abhyankar pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 18369877982aSShri Abhyankar nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 18379877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18389877982aSShri Abhyankar col = pj[j]; 18399877982aSShri Abhyankar rtmp3[col] -= mul3 * rtmp2[col]; 18409877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp2[col]; 18419877982aSShri Abhyankar } 18429566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * nz)); 18439877982aSShri Abhyankar } 18449877982aSShri Abhyankar 18459877982aSShri Abhyankar /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 18469877982aSShri Abhyankar rs = 0.0; 18479877982aSShri Abhyankar /* L part */ 18489877982aSShri Abhyankar pc3 = b->a + bi[i + 2]; 18499877982aSShri Abhyankar pj = b->j + bi[i + 2]; 18509877982aSShri Abhyankar nz = bi[i + 3] - bi[i + 2]; 18519877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18529877982aSShri 
Abhyankar col = pj[j]; 18539371c9d4SSatish Balay pc3[j] = rtmp3[col]; 18549371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 18559877982aSShri Abhyankar } 18569877982aSShri Abhyankar /* U part */ 18579877982aSShri Abhyankar pc3 = b->a + bdiag[i + 3] + 1; 18589877982aSShri Abhyankar pj = b->j + bdiag[i + 3] + 1; 18599877982aSShri Abhyankar nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 18609877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18619877982aSShri Abhyankar col = pj[j]; 18629371c9d4SSatish Balay pc3[j] = rtmp3[col]; 18639371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 18649877982aSShri Abhyankar } 18659877982aSShri Abhyankar 18669877982aSShri Abhyankar sctx.rs = rs; 18679877982aSShri Abhyankar sctx.pv = rtmp3[i + 2]; 18689566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 186907b50cabSHong Zhang if (sctx.newshift) break; 18709877982aSShri Abhyankar pc3 = b->a + bdiag[i + 2]; 18719877982aSShri Abhyankar *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 18729877982aSShri Abhyankar 18739877982aSShri Abhyankar /* Now take care of 3rd column of diagonal 4x4 block. 
*/ 18749877982aSShri Abhyankar pc4 = rtmp4 + i + 2; 18759877982aSShri Abhyankar if (*pc4 != 0.0) { 18769371c9d4SSatish Balay mul4 = (*pc4) * (*pc3); 18779371c9d4SSatish Balay *pc4 = mul4; 18789877982aSShri Abhyankar pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 18799877982aSShri Abhyankar nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 18809877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18819877982aSShri Abhyankar col = pj[j]; 18829877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp3[col]; 18839877982aSShri Abhyankar } 18849566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 18859877982aSShri Abhyankar } 18869877982aSShri Abhyankar 18879877982aSShri Abhyankar /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 18889877982aSShri Abhyankar rs = 0.0; 18899877982aSShri Abhyankar /* L part */ 18909877982aSShri Abhyankar pc4 = b->a + bi[i + 3]; 18919877982aSShri Abhyankar pj = b->j + bi[i + 3]; 18929877982aSShri Abhyankar nz = bi[i + 4] - bi[i + 3]; 18939877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18949877982aSShri Abhyankar col = pj[j]; 18959371c9d4SSatish Balay pc4[j] = rtmp4[col]; 18969371c9d4SSatish Balay rs += PetscAbsScalar(pc4[j]); 18979877982aSShri Abhyankar } 18989877982aSShri Abhyankar /* U part */ 18999877982aSShri Abhyankar pc4 = b->a + bdiag[i + 4] + 1; 19009877982aSShri Abhyankar pj = b->j + bdiag[i + 4] + 1; 19019877982aSShri Abhyankar nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 19029877982aSShri Abhyankar for (j = 0; j < nz; j++) { 19039877982aSShri Abhyankar col = pj[j]; 19049371c9d4SSatish Balay pc4[j] = rtmp4[col]; 19059371c9d4SSatish Balay rs += PetscAbsScalar(pc4[j]); 19069877982aSShri Abhyankar } 19079877982aSShri Abhyankar 19089877982aSShri Abhyankar sctx.rs = rs; 19099877982aSShri Abhyankar sctx.pv = rtmp4[i + 3]; 19109566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 191107b50cabSHong Zhang if (sctx.newshift) 
break; 19129877982aSShri Abhyankar pc4 = b->a + bdiag[i + 3]; 19139877982aSShri Abhyankar *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 19149877982aSShri Abhyankar break; 191568785679SHong Zhang 1916d71ae5a4SJacob Faibussowitsch default: 1917d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 191828f1b45aSHong Zhang } 1919c2b86aeeSHong Zhang if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 192028f1b45aSHong Zhang i += nodesz; /* Update the row */ 192168785679SHong Zhang } 192228f1b45aSHong Zhang 192328f1b45aSHong Zhang /* MatPivotRefine() */ 192407b50cabSHong Zhang if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 192528f1b45aSHong Zhang /* 192628f1b45aSHong Zhang * if no shift in this attempt & shifting & started shifting & can refine, 192728f1b45aSHong Zhang * then try lower shift 192828f1b45aSHong Zhang */ 192928f1b45aSHong Zhang sctx.shift_hi = sctx.shift_fraction; 193028f1b45aSHong Zhang sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 193128f1b45aSHong Zhang sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 193207b50cabSHong Zhang sctx.newshift = PETSC_TRUE; 193328f1b45aSHong Zhang sctx.nshift++; 193428f1b45aSHong Zhang } 193507b50cabSHong Zhang } while (sctx.newshift); 193628f1b45aSHong Zhang 19379566063dSJacob Faibussowitsch PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 19389566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp_vec2)); 19399566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 19409566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 194128f1b45aSHong Zhang 1942abb87a52SBarry Smith if (b->inode.size) { 1943abb87a52SBarry Smith C->ops->solve = MatSolve_SeqAIJ_Inode; 1944abb87a52SBarry Smith } else { 1945d3ac4fa3SBarry Smith C->ops->solve = MatSolve_SeqAIJ; 1946abb87a52SBarry Smith } 194728f1b45aSHong Zhang 
C->ops->solveadd = MatSolveAdd_SeqAIJ; 194828f1b45aSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 194928f1b45aSHong Zhang C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 195028f1b45aSHong Zhang C->ops->matsolve = MatMatSolve_SeqAIJ; 195128f1b45aSHong Zhang C->assembled = PETSC_TRUE; 195228f1b45aSHong Zhang C->preallocated = PETSC_TRUE; 19532205254eSKarl Rupp 19549566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(C->cmap->n)); 195528f1b45aSHong Zhang 195628f1b45aSHong Zhang /* MatShiftView(A,info,&sctx) */ 195728f1b45aSHong Zhang if (sctx.nshift) { 1958f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 19599566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1960f4db908eSBarry Smith } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 19619566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1962f4db908eSBarry Smith } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 19639566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 196428f1b45aSHong Zhang } 196528f1b45aSHong Zhang } 19663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 196728f1b45aSHong Zhang } 1968628f99d7SShri Abhyankar
#if 0
// unused
/*
   MatLUFactorNumeric_SeqAIJ_Inode_inplace - numeric LU factorization of a SeqAIJ matrix
   that processes the rows of each inode (1, 2 or 3 rows at a time) together.

   Input Parameters:
+  B    - the factor matrix; its Mat_SeqAIJ arrays (i, j, a, diag) and its row/col/icol
          index sets are read, and its value array is overwritten with the factors
.  A    - the matrix being factored; A->data must carry inode information (inode.size)
-  info - factorization options; only info->shifttype is consulted directly here, the
          rest is forwarded to MatPivotCheck()

   Notes:
   Inodes with more than 3 rows are split into two pieces before factoring (the split
   assumes an inode has at most 5 rows), and the resulting sizes are re-ordered according
   to the row permutation.

   Whenever MatPivotCheck() sets sctx.newshift the whole factorization is restarted with
   the updated sctx.shift_amount.

   On success the solve, solvetranspose and solvetransposeadd operations of B are set to
   the *_inplace SeqAIJ variants and B is marked assembled and preallocated.

   The three work arrays (tmp_vec1, nsmap, tmp_vec2) are each obtained with PetscMalloc1()
   and released with PetscFree(). Earlier code allocated nsmap/tmp_vec2 with one
   PetscMalloc2() but released nsmap together with tmp_vec1 through PetscFree2(), which
   pairs pointers that were never allocated together and can release the storage that
   tmp_vec2 (used as ns for the whole factorization) lives in.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat              C = B;
  Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
  IS               iscol = b->col, isrow = b->row, isicol = b->icol;
  const PetscInt  *r, *ic, *c, *ics;
  PetscInt         n = A->rmap->n, *bi = b->i;
  PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
  PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
  PetscInt        *ai = a->i, *aj = a->j;
  PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
  PetscScalar      mul1, mul2, mul3, tmp;
  MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
  const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1;
  PetscReal        rs = 0.0;
  FactorShiftCtx   sctx;

  PetscFunctionBegin;
  sctx.shift_top      = 0;
  sctx.nshift_max     = 0;
  sctx.shift_lo       = 0;
  sctx.shift_hi       = 0;
  sctx.shift_fraction = 0;

  /* For the positive-definite shift, set sctx.shift_top = max over rows of rs */
  if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
    sctx.shift_top = 0;
    for (i = 0; i < n; i++) {
      /* calculate rs = sum(|aij|) - RealPart(aii), the amount of shift needed for this row */
      rs    = 0.0;
      ajtmp = aj + ai[i];
      rtmp1 = aa + ai[i];
      nz    = ai[i + 1] - ai[i];
      for (j = 0; j < nz; j++) {
        if (*ajtmp != i) {
          rs += PetscAbsScalar(*rtmp1++);
        } else {
          rs -= PetscRealPart(*rtmp1++);
        }
        ajtmp++;
      }
      if (rs > sctx.shift_top) sctx.shift_top = rs;
    }
    if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
    sctx.shift_top *= 1.1;
    sctx.nshift_max = 5;
    sctx.shift_lo   = 0.;
    sctx.shift_hi   = 1.;
  }
  sctx.shift_amount = 0;
  sctx.nshift       = 0;

  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(iscol, &c));
  PetscCall(ISGetIndices(isicol, &ic));
  /* one zeroed dense work row per row of the largest inode piece handled here (3) */
  PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
  ics = ic;

  node_max = a->inode.node_count;
  ns       = a->inode.size;
  PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");

  /* If an inode has more than 3 rows, split it into two inodes (assumes ns[i] <= 5). */
  /* The sizes are mapped according to the ordering further below. */
  PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
  for (i = 0, j = 0; i < node_max; ++i, ++j) {
    if (ns[i] > 3) {
      tmp_vec1[j] = ns[i] / 2;
      ++j;
      tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
    } else {
      tmp_vec1[j] = ns[i];
    }
  }
  /* Use the node count after splitting */
  node_max = j;

  /* Now reorder the inode info based on the matrix re-ordering info */
  /* First create a row -> inode_size_array_index map */
  PetscCall(PetscMalloc1(n + 1, &nsmap));
  PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
  for (i = 0, row = 0; i < node_max; i++) {
    nodesz = tmp_vec1[i];
    for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
  }
  /* Using nsmap, create a reordered ns structure */
  for (i = 0, j = 0; i < node_max; i++) {
    nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row number is in r[] */
    tmp_vec2[i] = nodesz;
    j += nodesz;
  }
  /* Each array was allocated on its own, so each is released on its own */
  PetscCall(PetscFree(nsmap));
  PetscCall(PetscFree(tmp_vec1));
  /* From here on ns is the split, reordered size array; it is freed after the factorization */
  ns = tmp_vec2;

  do {
    sctx.newshift = PETSC_FALSE;
    /* Now loop over each block-row, and do the factorization */
    for (i = 0, row = 0; i < node_max; i++) {
      nodesz = ns[i];
      nz     = bi[row + 1] - bi[row];
      bjtmp  = bj + bi[row];

      switch (nodesz) {
      case 1:
        for (j = 0; j < nz; j++) {
          idx         = bjtmp[j];
          rtmp11[idx] = 0.0;
        }

        /* load in initial (unfactored row) */
        idx    = r[row];
        nz_tmp = ai[idx + 1] - ai[idx];
        ajtmp  = aj + ai[idx];
        v1     = aa + ai[idx];

        for (j = 0; j < nz_tmp; j++) {
          idx         = ics[ajtmp[j]];
          rtmp11[idx] = v1[j];
        }
        rtmp11[ics[r[row]]] += sctx.shift_amount;

        prow = *bjtmp++;
        while (prow < row) {
          pc1 = rtmp11 + prow;
          if (*pc1 != 0.0) {
            pv     = ba + bd[prow];
            pj     = nbj + bd[prow];
            mul1   = *pc1 * *pv++;
            *pc1   = mul1;
            nz_tmp = bi[prow + 1] - bd[prow] - 1;
            PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
            for (j = 0; j < nz_tmp; j++) {
              tmp = pv[j];
              idx = pj[j];
              rtmp11[idx] -= mul1 * tmp;
            }
          }
          prow = *bjtmp++;
        }
        pj  = bj + bi[row];
        pc1 = ba + bi[row];

        sctx.pv     = rtmp11[row];
        rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
        rs          = 0.0;
        for (j = 0; j < nz; j++) {
          idx    = pj[j];
          pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
          if (idx != row) rs += PetscAbsScalar(pc1[j]);
        }
        sctx.rs = rs;
        PetscCall(MatPivotCheck(B, A, info, &sctx, row));
        if (sctx.newshift) goto endofwhile;
        break;

      case 2:
        for (j = 0; j < nz; j++) {
          idx         = bjtmp[j];
          rtmp11[idx] = 0.0;
          rtmp22[idx] = 0.0;
        }

        /* load in initial (unfactored rows); both rows are read with the column indices of the first */
        idx    = r[row];
        nz_tmp = ai[idx + 1] - ai[idx];
        ajtmp  = aj + ai[idx];
        v1     = aa + ai[idx];
        v2     = aa + ai[idx + 1];
        for (j = 0; j < nz_tmp; j++) {
          idx         = ics[ajtmp[j]];
          rtmp11[idx] = v1[j];
          rtmp22[idx] = v2[j];
        }
        rtmp11[ics[r[row]]] += sctx.shift_amount;
        rtmp22[ics[r[row + 1]]] += sctx.shift_amount;

        prow = *bjtmp++;
        while (prow < row) {
          pc1 = rtmp11 + prow;
          pc2 = rtmp22 + prow;
          if (*pc1 != 0.0 || *pc2 != 0.0) {
            pv   = ba + bd[prow];
            pj   = nbj + bd[prow];
            mul1 = *pc1 * *pv;
            mul2 = *pc2 * *pv;
            ++pv;
            *pc1 = mul1;
            *pc2 = mul2;

            nz_tmp = bi[prow + 1] - bd[prow] - 1;
            for (j = 0; j < nz_tmp; j++) {
              tmp = pv[j];
              idx = pj[j];
              rtmp11[idx] -= mul1 * tmp;
              rtmp22[idx] -= mul2 * tmp;
            }
            PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
          }
          prow = *bjtmp++;
        }

        /* Now take care of diagonal 2x2 block. Note: prow = row here */
        pc1 = rtmp11 + prow;
        pc2 = rtmp22 + prow;

        sctx.pv = *pc1;
        pj      = bj + bi[prow];
        rs      = 0.0;
        for (j = 0; j < nz; j++) {
          idx = pj[j];
          if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
        }
        sctx.rs = rs;
        PetscCall(MatPivotCheck(B, A, info, &sctx, row));
        if (sctx.newshift) goto endofwhile;

        if (*pc2 != 0.0) {
          pj     = nbj + bd[prow];
          mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
          *pc2   = mul2;
          nz_tmp = bi[prow + 1] - bd[prow] - 1;
          for (j = 0; j < nz_tmp; j++) {
            idx = pj[j];
            tmp = rtmp11[idx];
            rtmp22[idx] -= mul2 * tmp;
          }
          PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
        }

        pj  = bj + bi[row];
        pc1 = ba + bi[row];
        pc2 = ba + bi[row + 1];

        sctx.pv         = rtmp22[row + 1];
        rs              = 0.0;
        rtmp11[row]     = 1.0 / rtmp11[row];
        rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
        /* copy row entries from dense representation to sparse */
        for (j = 0; j < nz; j++) {
          idx    = pj[j];
          pc1[j] = rtmp11[idx];
          pc2[j] = rtmp22[idx];
          if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
        }
        sctx.rs = rs;
        PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
        if (sctx.newshift) goto endofwhile;
        break;

      case 3:
        for (j = 0; j < nz; j++) {
          idx         = bjtmp[j];
          rtmp11[idx] = 0.0;
          rtmp22[idx] = 0.0;
          rtmp33[idx] = 0.0;
        }
        /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
        idx    = r[row];
        nz_tmp = ai[idx + 1] - ai[idx];
        ajtmp  = aj + ai[idx];
        v1     = aa + ai[idx];
        v2     = aa + ai[idx + 1];
        v3     = aa + ai[idx + 2];
        for (j = 0; j < nz_tmp; j++) {
          idx         = ics[ajtmp[j]];
          rtmp11[idx] = v1[j];
          rtmp22[idx] = v2[j];
          rtmp33[idx] = v3[j];
        }
        rtmp11[ics[r[row]]] += sctx.shift_amount;
        rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
        rtmp33[ics[r[row + 2]]] += sctx.shift_amount;

        /* loop over all pivot row blocks above this row block */
        prow = *bjtmp++;
        while (prow < row) {
          pc1 = rtmp11 + prow;
          pc2 = rtmp22 + prow;
          pc3 = rtmp33 + prow;
          if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
            pv   = ba + bd[prow];
            pj   = nbj + bd[prow];
            mul1 = *pc1 * *pv;
            mul2 = *pc2 * *pv;
            mul3 = *pc3 * *pv;
            ++pv;
            *pc1 = mul1;
            *pc2 = mul2;
            *pc3 = mul3;

            nz_tmp = bi[prow + 1] - bd[prow] - 1;
            /* update this row based on pivot row */
            for (j = 0; j < nz_tmp; j++) {
              tmp = pv[j];
              idx = pj[j];
              rtmp11[idx] -= mul1 * tmp;
              rtmp22[idx] -= mul2 * tmp;
              rtmp33[idx] -= mul3 * tmp;
            }
            PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
          }
          prow = *bjtmp++;
        }

        /* Now take care of diagonal 3x3 block in this set of rows */
        /* note: prow = row here */
        pc1 = rtmp11 + prow;
        pc2 = rtmp22 + prow;
        pc3 = rtmp33 + prow;

        sctx.pv = *pc1;
        pj      = bj + bi[prow];
        rs      = 0.0;
        for (j = 0; j < nz; j++) {
          idx = pj[j];
          if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
        }
        sctx.rs = rs;
        PetscCall(MatPivotCheck(B, A, info, &sctx, row));
        if (sctx.newshift) goto endofwhile;

        if (*pc2 != 0.0 || *pc3 != 0.0) {
          mul2   = (*pc2) / (*pc1);
          mul3   = (*pc3) / (*pc1);
          *pc2   = mul2;
          *pc3   = mul3;
          nz_tmp = bi[prow + 1] - bd[prow] - 1;
          pj     = nbj + bd[prow];
          for (j = 0; j < nz_tmp; j++) {
            idx = pj[j];
            tmp = rtmp11[idx];
            rtmp22[idx] -= mul2 * tmp;
            rtmp33[idx] -= mul3 * tmp;
          }
          PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
        }
        ++prow;

        pc2     = rtmp22 + prow;
        pc3     = rtmp33 + prow;
        sctx.pv = *pc2;
        pj      = bj + bi[prow];
        rs      = 0.0;
        for (j = 0; j < nz; j++) {
          idx = pj[j];
          if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
        }
        sctx.rs = rs;
        PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
        if (sctx.newshift) goto endofwhile;

        if (*pc3 != 0.0) {
          mul3   = (*pc3) / (*pc2);
          *pc3   = mul3;
          pj     = nbj + bd[prow];
          nz_tmp = bi[prow + 1] - bd[prow] - 1;
          for (j = 0; j < nz_tmp; j++) {
            idx = pj[j];
            tmp = rtmp22[idx];
            rtmp33[idx] -= mul3 * tmp;
          }
          PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
        }

        pj  = bj + bi[row];
        pc1 = ba + bi[row];
        pc2 = ba + bi[row + 1];
        pc3 = ba + bi[row + 2];

        sctx.pv         = rtmp33[row + 2];
        rs              = 0.0;
        rtmp11[row]     = 1.0 / rtmp11[row];
        rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
        rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
        /* copy row entries from dense representation to sparse */
        for (j = 0; j < nz; j++) {
          idx    = pj[j];
          pc1[j] = rtmp11[idx];
          pc2[j] = rtmp22[idx];
          pc3[j] = rtmp33[idx];
          if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
        }

        sctx.rs = rs;
        PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
        if (sctx.newshift) goto endofwhile;
        break;

      default:
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
      }
      row += nodesz; /* Update the row */
    }
  endofwhile:; /* reached early when a pivot check requested a new shift; the do-loop then restarts */
  } while (sctx.newshift);
  PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
  PetscCall(PetscFree(tmp_vec2));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));
  PetscCall(ISRestoreIndices(iscol, &c));

  (B)->ops->solve = MatSolve_SeqAIJ_inplace;
  /* do not set solve add, since MatSolve_Inode + Add is faster */
  C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
  C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
  C->assembled              = PETSC_TRUE;
  C->preallocated           = PETSC_TRUE;
  if (sctx.nshift) {
    if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
      PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
    } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
      PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
    }
  }
  PetscCall(PetscLogFlops(C->cmap->n));
  PetscCall(MatSeqAIJCheckInode(C));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
2390628f99d7SShri Abhyankar 2391d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 2392d71ae5a4SJacob Faibussowitsch { 2393019b515eSShri Abhyankar Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2394019b515eSShri Abhyankar IS iscol = a->col, isrow = a->row; 2395019b515eSShri Abhyankar const PetscInt *r, *c, *rout, *cout; 23968758e1faSBarry Smith PetscInt i, j, n = A->rmap->n; 23978758e1faSBarry Smith PetscInt node_max, row, nsz, aii, i0, i1, nz; 23988758e1faSBarry Smith const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2399019b515eSShri Abhyankar PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2400019b515eSShri Abhyankar PetscScalar sum1, sum2, sum3, sum4, sum5; 2401019b515eSShri Abhyankar const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2402019b515eSShri Abhyankar const PetscScalar *b; 2403019b515eSShri Abhyankar 2404019b515eSShri Abhyankar PetscFunctionBegin; 240508401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2406019b515eSShri Abhyankar node_max = a->inode.node_count; 2407019b515eSShri Abhyankar ns = a->inode.size; /* Node Size array */ 2408019b515eSShri Abhyankar 24099566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 24109566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(xx, &x)); 2411019b515eSShri Abhyankar tmp = a->solve_work; 2412019b515eSShri Abhyankar 24139371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 24149371c9d4SSatish Balay r = rout; 24159371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 24169371c9d4SSatish Balay c = cout; 2417019b515eSShri Abhyankar 2418019b515eSShri Abhyankar /* forward solve the lower triangular */ 2419019b515eSShri Abhyankar tmps = tmp; 2420019b515eSShri Abhyankar aa = a_a; 2421019b515eSShri Abhyankar aj = a_j; 2422019b515eSShri Abhyankar ad = a->diag; 2423019b515eSShri Abhyankar 2424019b515eSShri Abhyankar for (i =
0, row = 0; i < node_max; ++i) { 2425019b515eSShri Abhyankar nsz = ns[i]; 2426019b515eSShri Abhyankar aii = ai[row]; 2427019b515eSShri Abhyankar v1 = aa + aii; 2428019b515eSShri Abhyankar vi = aj + aii; 2429019b515eSShri Abhyankar nz = ai[row + 1] - ai[row]; 2430019b515eSShri Abhyankar 243198991853SShri Abhyankar if (i < node_max - 1) { 243298991853SShri Abhyankar /* Prefetch the indices for the next block */ 243350d8bf02SJed Brown PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 243498991853SShri Abhyankar /* Prefetch the data for the next block */ 243550d8bf02SJed Brown PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 243698991853SShri Abhyankar } 243798991853SShri Abhyankar 2438019b515eSShri Abhyankar switch (nsz) { /* Each loop in 'case' is unrolled */ 2439019b515eSShri Abhyankar case 1: 2440019b515eSShri Abhyankar sum1 = b[r[row]]; 2441019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2442019b515eSShri Abhyankar i0 = vi[j]; 2443019b515eSShri Abhyankar i1 = vi[j + 1]; 2444019b515eSShri Abhyankar tmp0 = tmps[i0]; 2445019b515eSShri Abhyankar tmp1 = tmps[i1]; 2446019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2447019b515eSShri Abhyankar } 2448019b515eSShri Abhyankar if (j == nz - 1) { 2449019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2450019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2451019b515eSShri Abhyankar } 2452019b515eSShri Abhyankar tmp[row++] = sum1; 2453019b515eSShri Abhyankar break; 2454019b515eSShri Abhyankar case 2: 2455019b515eSShri Abhyankar sum1 = b[r[row]]; 2456019b515eSShri Abhyankar sum2 = b[r[row + 1]]; 2457019b515eSShri Abhyankar v2 = aa + ai[row + 1]; 2458019b515eSShri Abhyankar 2459019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2460019b515eSShri Abhyankar i0 = vi[j]; 2461019b515eSShri Abhyankar i1 = vi[j + 1]; 2462019b515eSShri Abhyankar tmp0 = tmps[i0]; 2463019b515eSShri 
Abhyankar tmp1 = tmps[i1]; 2464019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2465019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2466019b515eSShri Abhyankar } 2467019b515eSShri Abhyankar if (j == nz - 1) { 2468019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2469019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2470019b515eSShri Abhyankar sum2 -= v2[j] * tmp0; 2471019b515eSShri Abhyankar } 2472019b515eSShri Abhyankar sum2 -= v2[nz] * sum1; 2473019b515eSShri Abhyankar tmp[row++] = sum1; 2474019b515eSShri Abhyankar tmp[row++] = sum2; 2475019b515eSShri Abhyankar break; 2476019b515eSShri Abhyankar case 3: 2477019b515eSShri Abhyankar sum1 = b[r[row]]; 2478019b515eSShri Abhyankar sum2 = b[r[row + 1]]; 2479019b515eSShri Abhyankar sum3 = b[r[row + 2]]; 2480019b515eSShri Abhyankar v2 = aa + ai[row + 1]; 2481019b515eSShri Abhyankar v3 = aa + ai[row + 2]; 2482019b515eSShri Abhyankar 2483019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2484019b515eSShri Abhyankar i0 = vi[j]; 2485019b515eSShri Abhyankar i1 = vi[j + 1]; 2486019b515eSShri Abhyankar tmp0 = tmps[i0]; 2487019b515eSShri Abhyankar tmp1 = tmps[i1]; 2488019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2489019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2490019b515eSShri Abhyankar sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2491019b515eSShri Abhyankar } 2492019b515eSShri Abhyankar if (j == nz - 1) { 2493019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2494019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2495019b515eSShri Abhyankar sum2 -= v2[j] * tmp0; 2496019b515eSShri Abhyankar sum3 -= v3[j] * tmp0; 2497019b515eSShri Abhyankar } 2498019b515eSShri Abhyankar sum2 -= v2[nz] * sum1; 2499019b515eSShri Abhyankar sum3 -= v3[nz] * sum1; 2500019b515eSShri Abhyankar sum3 -= v3[nz + 1] * sum2; 2501019b515eSShri Abhyankar tmp[row++] = sum1; 2502019b515eSShri Abhyankar tmp[row++] = sum2; 2503019b515eSShri Abhyankar tmp[row++] = sum3; 2504019b515eSShri Abhyankar break; 
2505019b515eSShri Abhyankar 2506019b515eSShri Abhyankar case 4: 2507019b515eSShri Abhyankar sum1 = b[r[row]]; 2508019b515eSShri Abhyankar sum2 = b[r[row + 1]]; 2509019b515eSShri Abhyankar sum3 = b[r[row + 2]]; 2510019b515eSShri Abhyankar sum4 = b[r[row + 3]]; 2511019b515eSShri Abhyankar v2 = aa + ai[row + 1]; 2512019b515eSShri Abhyankar v3 = aa + ai[row + 2]; 2513019b515eSShri Abhyankar v4 = aa + ai[row + 3]; 2514019b515eSShri Abhyankar 2515019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2516019b515eSShri Abhyankar i0 = vi[j]; 2517019b515eSShri Abhyankar i1 = vi[j + 1]; 2518019b515eSShri Abhyankar tmp0 = tmps[i0]; 2519019b515eSShri Abhyankar tmp1 = tmps[i1]; 2520019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2521019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2522019b515eSShri Abhyankar sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2523019b515eSShri Abhyankar sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2524019b515eSShri Abhyankar } 2525019b515eSShri Abhyankar if (j == nz - 1) { 2526019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2527019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2528019b515eSShri Abhyankar sum2 -= v2[j] * tmp0; 2529019b515eSShri Abhyankar sum3 -= v3[j] * tmp0; 2530019b515eSShri Abhyankar sum4 -= v4[j] * tmp0; 2531019b515eSShri Abhyankar } 2532019b515eSShri Abhyankar sum2 -= v2[nz] * sum1; 2533019b515eSShri Abhyankar sum3 -= v3[nz] * sum1; 2534019b515eSShri Abhyankar sum4 -= v4[nz] * sum1; 2535019b515eSShri Abhyankar sum3 -= v3[nz + 1] * sum2; 2536019b515eSShri Abhyankar sum4 -= v4[nz + 1] * sum2; 2537019b515eSShri Abhyankar sum4 -= v4[nz + 2] * sum3; 2538019b515eSShri Abhyankar 2539019b515eSShri Abhyankar tmp[row++] = sum1; 2540019b515eSShri Abhyankar tmp[row++] = sum2; 2541019b515eSShri Abhyankar tmp[row++] = sum3; 2542019b515eSShri Abhyankar tmp[row++] = sum4; 2543019b515eSShri Abhyankar break; 2544019b515eSShri Abhyankar case 5: 2545019b515eSShri Abhyankar sum1 = b[r[row]]; 2546019b515eSShri Abhyankar sum2 = 
b[r[row + 1]]; 2547019b515eSShri Abhyankar sum3 = b[r[row + 2]]; 2548019b515eSShri Abhyankar sum4 = b[r[row + 3]]; 2549019b515eSShri Abhyankar sum5 = b[r[row + 4]]; 2550019b515eSShri Abhyankar v2 = aa + ai[row + 1]; 2551019b515eSShri Abhyankar v3 = aa + ai[row + 2]; 2552019b515eSShri Abhyankar v4 = aa + ai[row + 3]; 2553019b515eSShri Abhyankar v5 = aa + ai[row + 4]; 2554019b515eSShri Abhyankar 2555019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2556019b515eSShri Abhyankar i0 = vi[j]; 2557019b515eSShri Abhyankar i1 = vi[j + 1]; 2558019b515eSShri Abhyankar tmp0 = tmps[i0]; 2559019b515eSShri Abhyankar tmp1 = tmps[i1]; 2560019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2561019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2562019b515eSShri Abhyankar sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2563019b515eSShri Abhyankar sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2564019b515eSShri Abhyankar sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2565019b515eSShri Abhyankar } 2566019b515eSShri Abhyankar if (j == nz - 1) { 2567019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2568019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2569019b515eSShri Abhyankar sum2 -= v2[j] * tmp0; 2570019b515eSShri Abhyankar sum3 -= v3[j] * tmp0; 2571019b515eSShri Abhyankar sum4 -= v4[j] * tmp0; 2572019b515eSShri Abhyankar sum5 -= v5[j] * tmp0; 2573019b515eSShri Abhyankar } 2574019b515eSShri Abhyankar 2575019b515eSShri Abhyankar sum2 -= v2[nz] * sum1; 2576019b515eSShri Abhyankar sum3 -= v3[nz] * sum1; 2577019b515eSShri Abhyankar sum4 -= v4[nz] * sum1; 2578019b515eSShri Abhyankar sum5 -= v5[nz] * sum1; 2579019b515eSShri Abhyankar sum3 -= v3[nz + 1] * sum2; 2580019b515eSShri Abhyankar sum4 -= v4[nz + 1] * sum2; 2581019b515eSShri Abhyankar sum5 -= v5[nz + 1] * sum2; 2582019b515eSShri Abhyankar sum4 -= v4[nz + 2] * sum3; 2583019b515eSShri Abhyankar sum5 -= v5[nz + 2] * sum3; 2584019b515eSShri Abhyankar sum5 -= v5[nz + 3] * sum4; 2585019b515eSShri Abhyankar 2586019b515eSShri 
Abhyankar tmp[row++] = sum1; 2587019b515eSShri Abhyankar tmp[row++] = sum2; 2588019b515eSShri Abhyankar tmp[row++] = sum3; 2589019b515eSShri Abhyankar tmp[row++] = sum4; 2590019b515eSShri Abhyankar tmp[row++] = sum5; 2591019b515eSShri Abhyankar break; 2592d71ae5a4SJacob Faibussowitsch default: 2593d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2594019b515eSShri Abhyankar } 2595019b515eSShri Abhyankar } 2596019b515eSShri Abhyankar /* backward solve the upper triangular */ 2597019b515eSShri Abhyankar for (i = node_max - 1, row = n - 1; i >= 0; i--) { 2598019b515eSShri Abhyankar nsz = ns[i]; 2599019b515eSShri Abhyankar aii = ad[row + 1] + 1; 2600019b515eSShri Abhyankar v1 = aa + aii; 2601019b515eSShri Abhyankar vi = aj + aii; 2602019b515eSShri Abhyankar nz = ad[row] - ad[row + 1] - 1; 260398991853SShri Abhyankar 260498991853SShri Abhyankar if (i > 0) { 260598991853SShri Abhyankar /* Prefetch the indices for the next block */ 260650d8bf02SJed Brown PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 260798991853SShri Abhyankar /* Prefetch the data for the next block */ 260850d8bf02SJed Brown PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 260998991853SShri Abhyankar } 261098991853SShri Abhyankar 2611019b515eSShri Abhyankar switch (nsz) { /* Each loop in 'case' is unrolled */ 2612019b515eSShri Abhyankar case 1: 2613019b515eSShri Abhyankar sum1 = tmp[row]; 2614019b515eSShri Abhyankar 2615019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2616019b515eSShri Abhyankar i0 = vi[j]; 2617019b515eSShri Abhyankar i1 = vi[j + 1]; 2618019b515eSShri Abhyankar tmp0 = tmps[i0]; 2619019b515eSShri Abhyankar tmp1 = tmps[i1]; 2620019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2621019b515eSShri Abhyankar } 2622019b515eSShri Abhyankar if (j == nz - 1) { 2623019b515eSShri 
Abhyankar tmp0 = tmps[vi[j]]; 2624019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2625019b515eSShri Abhyankar } 26269371c9d4SSatish Balay x[c[row]] = tmp[row] = sum1 * v1[nz]; 26279371c9d4SSatish Balay row--; 2628019b515eSShri Abhyankar break; 2629019b515eSShri Abhyankar case 2: 2630019b515eSShri Abhyankar sum1 = tmp[row]; 2631019b515eSShri Abhyankar sum2 = tmp[row - 1]; 2632019b515eSShri Abhyankar v2 = aa + ad[row] + 1; 2633019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2634019b515eSShri Abhyankar i0 = vi[j]; 2635019b515eSShri Abhyankar i1 = vi[j + 1]; 2636019b515eSShri Abhyankar tmp0 = tmps[i0]; 2637019b515eSShri Abhyankar tmp1 = tmps[i1]; 2638019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2639019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2640019b515eSShri Abhyankar } 2641019b515eSShri Abhyankar if (j == nz - 1) { 2642019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2643019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2644019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0; 2645019b515eSShri Abhyankar } 2646019b515eSShri Abhyankar 26479371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 26489371c9d4SSatish Balay row--; 2649019b515eSShri Abhyankar sum2 -= v2[0] * tmp0; 26509371c9d4SSatish Balay x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 26519371c9d4SSatish Balay row--; 2652019b515eSShri Abhyankar break; 2653019b515eSShri Abhyankar case 3: 2654019b515eSShri Abhyankar sum1 = tmp[row]; 2655019b515eSShri Abhyankar sum2 = tmp[row - 1]; 2656019b515eSShri Abhyankar sum3 = tmp[row - 2]; 2657019b515eSShri Abhyankar v2 = aa + ad[row] + 1; 2658019b515eSShri Abhyankar v3 = aa + ad[row - 1] + 1; 2659019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2660019b515eSShri Abhyankar i0 = vi[j]; 2661019b515eSShri Abhyankar i1 = vi[j + 1]; 2662019b515eSShri Abhyankar tmp0 = tmps[i0]; 2663019b515eSShri Abhyankar tmp1 = tmps[i1]; 2664019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2665019b515eSShri Abhyankar 
sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2666019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2667019b515eSShri Abhyankar } 2668019b515eSShri Abhyankar if (j == nz - 1) { 2669019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2670019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2671019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0; 2672019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0; 2673019b515eSShri Abhyankar } 26749371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 26759371c9d4SSatish Balay row--; 2676019b515eSShri Abhyankar sum2 -= v2[0] * tmp0; 2677019b515eSShri Abhyankar sum3 -= v3[1] * tmp0; 26789371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 26799371c9d4SSatish Balay row--; 2680019b515eSShri Abhyankar sum3 -= v3[0] * tmp0; 26819371c9d4SSatish Balay x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 26829371c9d4SSatish Balay row--; 2683019b515eSShri Abhyankar 2684019b515eSShri Abhyankar break; 2685019b515eSShri Abhyankar case 4: 2686019b515eSShri Abhyankar sum1 = tmp[row]; 2687019b515eSShri Abhyankar sum2 = tmp[row - 1]; 2688019b515eSShri Abhyankar sum3 = tmp[row - 2]; 2689019b515eSShri Abhyankar sum4 = tmp[row - 3]; 2690019b515eSShri Abhyankar v2 = aa + ad[row] + 1; 2691019b515eSShri Abhyankar v3 = aa + ad[row - 1] + 1; 2692019b515eSShri Abhyankar v4 = aa + ad[row - 2] + 1; 2693019b515eSShri Abhyankar 2694019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2695019b515eSShri Abhyankar i0 = vi[j]; 2696019b515eSShri Abhyankar i1 = vi[j + 1]; 2697019b515eSShri Abhyankar tmp0 = tmps[i0]; 2698019b515eSShri Abhyankar tmp1 = tmps[i1]; 2699019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2700019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2701019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2702019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2703019b515eSShri Abhyankar } 2704019b515eSShri Abhyankar if (j == nz - 1) { 2705019b515eSShri 
Abhyankar tmp0 = tmps[vi[j]]; 2706019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2707019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0; 2708019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0; 2709019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0; 2710019b515eSShri Abhyankar } 2711019b515eSShri Abhyankar 27129371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 27139371c9d4SSatish Balay row--; 2714019b515eSShri Abhyankar sum2 -= v2[0] * tmp0; 2715019b515eSShri Abhyankar sum3 -= v3[1] * tmp0; 2716019b515eSShri Abhyankar sum4 -= v4[2] * tmp0; 27179371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 27189371c9d4SSatish Balay row--; 2719019b515eSShri Abhyankar sum3 -= v3[0] * tmp0; 2720019b515eSShri Abhyankar sum4 -= v4[1] * tmp0; 27219371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 27229371c9d4SSatish Balay row--; 2723019b515eSShri Abhyankar sum4 -= v4[0] * tmp0; 27249371c9d4SSatish Balay x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 27259371c9d4SSatish Balay row--; 2726019b515eSShri Abhyankar break; 2727019b515eSShri Abhyankar case 5: 2728019b515eSShri Abhyankar sum1 = tmp[row]; 2729019b515eSShri Abhyankar sum2 = tmp[row - 1]; 2730019b515eSShri Abhyankar sum3 = tmp[row - 2]; 2731019b515eSShri Abhyankar sum4 = tmp[row - 3]; 2732019b515eSShri Abhyankar sum5 = tmp[row - 4]; 2733019b515eSShri Abhyankar v2 = aa + ad[row] + 1; 2734019b515eSShri Abhyankar v3 = aa + ad[row - 1] + 1; 2735019b515eSShri Abhyankar v4 = aa + ad[row - 2] + 1; 2736019b515eSShri Abhyankar v5 = aa + ad[row - 3] + 1; 2737019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2738019b515eSShri Abhyankar i0 = vi[j]; 2739019b515eSShri Abhyankar i1 = vi[j + 1]; 2740019b515eSShri Abhyankar tmp0 = tmps[i0]; 2741019b515eSShri Abhyankar tmp1 = tmps[i1]; 2742019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2743019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2744019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 
2745019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2746019b515eSShri Abhyankar sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2747019b515eSShri Abhyankar } 2748019b515eSShri Abhyankar if (j == nz - 1) { 2749019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2750019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2751019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0; 2752019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0; 2753019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0; 2754019b515eSShri Abhyankar sum5 -= v5[j + 4] * tmp0; 2755019b515eSShri Abhyankar } 2756019b515eSShri Abhyankar 27579371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 27589371c9d4SSatish Balay row--; 2759019b515eSShri Abhyankar sum2 -= v2[0] * tmp0; 2760019b515eSShri Abhyankar sum3 -= v3[1] * tmp0; 2761019b515eSShri Abhyankar sum4 -= v4[2] * tmp0; 2762019b515eSShri Abhyankar sum5 -= v5[3] * tmp0; 27639371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 27649371c9d4SSatish Balay row--; 2765019b515eSShri Abhyankar sum3 -= v3[0] * tmp0; 2766019b515eSShri Abhyankar sum4 -= v4[1] * tmp0; 2767019b515eSShri Abhyankar sum5 -= v5[2] * tmp0; 27689371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 27699371c9d4SSatish Balay row--; 2770019b515eSShri Abhyankar sum4 -= v4[0] * tmp0; 2771019b515eSShri Abhyankar sum5 -= v5[1] * tmp0; 27729371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 27739371c9d4SSatish Balay row--; 2774019b515eSShri Abhyankar sum5 -= v5[0] * tmp0; 27759371c9d4SSatish Balay x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 27769371c9d4SSatish Balay row--; 2777019b515eSShri Abhyankar break; 2778d71ae5a4SJacob Faibussowitsch default: 2779d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2780019b515eSShri Abhyankar } 2781019b515eSShri Abhyankar } 27829566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 27839566063dSJacob Faibussowitsch 
PetscCall(ISRestoreIndices(iscol, &cout)); 27849566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 27859566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(xx, &x)); 27869566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 27873ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2788019b515eSShri Abhyankar } 2789019b515eSShri Abhyankar 27904c1414c8SBarry Smith /* 27914c1414c8SBarry Smith Makes a longer coloring[] array and calls the usual code with that. The input coloring[] is read once per inode (a->inode.node_count entries); entry j of inode i receives the value coloring[i] + j * ncolors in a newly allocated newcolor[] array. The values in newcolor[] are then renumbered to consecutive colors starting at 0, ncolors is replaced by the number of distinct values, and newcolor[] is handed to ISColoringCreate() with PETSC_OWN_POINTER. The input coloring[] array is released with PetscFree() before returning. The nin argument is not referenced in the body. NOTE(review): colorused[] has 5 * ncolors entries, which presumably relies on every inode size being at most 5 and every coloring[i] being less than ncolors - confirm. NOTE(review): newcolor[] is filled for sum(ns[i]) entries but read for mat->cmap->n entries; presumably the two counts are equal - confirm. 27924c1414c8SBarry Smith */ 2793d71ae5a4SJacob Faibussowitsch PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2794d71ae5a4SJacob Faibussowitsch { 27954c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2796d0f46423SBarry Smith PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row; 27974c1414c8SBarry Smith PetscInt *colorused, i; 27984c1414c8SBarry Smith ISColoringValue *newcolor; 27994c1414c8SBarry Smith 28004c1414c8SBarry Smith PetscFunctionBegin; 280108401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 28029566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &newcolor)); 28034c1414c8SBarry Smith /* loop over inodes, marking a color for each column: entry j of inode i gets coloring[i] + j * ncolors, so entries of one inode get distinct values */ 28044c1414c8SBarry Smith row = 0; 28054c1414c8SBarry Smith for (i = 0; i < m; i++) { 2806ad540459SPierre Jolivet for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors; 28074c1414c8SBarry Smith } 28084c1414c8SBarry Smith 28094c1414c8SBarry Smith /* eliminate unneeded colors: flag every value present in newcolor[], prefix-sum the flags so that colorused[c] is the 1-based rank of c among the used values, set ncolors to the last prefix sum (the number of distinct values), then renumber newcolor[] to the 0-based ranks */ 28109566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2811ad540459SPierre Jolivet for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 28124c1414c8SBarry Smith 2813ad540459SPierre Jolivet for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 28144c1414c8SBarry Smith ncolors = 
colorused[5 * ncolors - 1]; 2815ad540459SPierre Jolivet for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1; 28169566063dSJacob Faibussowitsch PetscCall(PetscFree(colorused)); 28179566063dSJacob Faibussowitsch PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 28189566063dSJacob Faibussowitsch PetscCall(PetscFree(coloring)); 28193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 28204c1414c8SBarry Smith } 28214c1414c8SBarry Smith 2822af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 28232af78befSBarry Smith 2824d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2825d71ae5a4SJacob Faibussowitsch { 28262af78befSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 28277aaeff0aSMatthew G. Knepley PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 28285850ef23SBarry Smith MatScalar *ibdiag, *bdiag, work[25], *t; 2829a8b09249SBarry Smith PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 28307aaeff0aSMatthew G. Knepley const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 28315850ef23SBarry Smith const PetscScalar *xb, *b; 28327b6c816cSBarry Smith PetscReal zeropivot = 100. 
* PETSC_MACHINE_EPSILON, shift = 0.0; 28338758e1faSBarry Smith PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2; 28348758e1faSBarry Smith PetscInt sz, k, ipvt[5]; 28357b6c816cSBarry Smith PetscBool allowzeropivot, zeropivotdetected; 28368758e1faSBarry Smith const PetscInt *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i; 28372af78befSBarry Smith 28382af78befSBarry Smith PetscFunctionBegin; 2839a455e926SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 284008401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 284108401ef6SPierre Jolivet PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 284208401ef6SPierre Jolivet PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 28432af78befSBarry Smith 284471f1c65dSBarry Smith if (!a->inode.ibdiagvalid) { 28452af78befSBarry Smith if (!a->inode.ibdiag) { 28462af78befSBarry Smith /* calculate space needed for diagonal blocks */ 2847ad540459SPierre Jolivet for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i]; 2848f0d39aaaSBarry Smith a->inode.bdiagsize = cnt; 28492205254eSKarl Rupp 28509566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 285171f1c65dSBarry Smith } 285271f1c65dSBarry Smith 285371f1c65dSBarry Smith /* copy over the diagonal blocks and invert them */ 28542af78befSBarry Smith ibdiag = a->inode.ibdiag; 28552af78befSBarry Smith bdiag = a->inode.bdiag; 28562af78befSBarry Smith cnt = 0; 28572af78befSBarry Smith for (i = 0, row = 0; i < m; i++) { 28582af78befSBarry Smith for (j = 0; j < sizes[i]; j++) { 2859ad540459SPierre Jolivet for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k]; 28602af78befSBarry Smith } 28619566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * 
sizes[i])); 28622af78befSBarry Smith 28632af78befSBarry Smith switch (sizes[i]) { 28642af78befSBarry Smith case 1: 28652af78befSBarry Smith /* Create matrix data structure */ 28668e0e2a9aSHong Zhang if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 28678e0e2a9aSHong Zhang if (allowzeropivot) { 28687b6c816cSBarry Smith A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 28697b6c816cSBarry Smith A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 28707b6c816cSBarry Smith A->factorerror_zeropivot_row = row; 28719566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 287298921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 28738e0e2a9aSHong Zhang } 287464c62002SMatthew Knepley ibdiag[cnt] = 1.0 / ibdiag[cnt]; 28752af78befSBarry Smith break; 28762af78befSBarry Smith case 2: 28779566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 28787b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 28792af78befSBarry Smith break; 28802af78befSBarry Smith case 3: 28819566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 28827b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 28832af78befSBarry Smith break; 28842af78befSBarry Smith case 4: 28859566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 28867b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 28872af78befSBarry Smith break; 28882af78befSBarry Smith case 5: 28899566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 28907b6c816cSBarry Smith if (zeropivotdetected) 
A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 28912af78befSBarry Smith break; 2892d71ae5a4SJacob Faibussowitsch default: 2893d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 28942af78befSBarry Smith } 28952af78befSBarry Smith cnt += sizes[i] * sizes[i]; 28962af78befSBarry Smith row += sizes[i]; 28972af78befSBarry Smith } 289871f1c65dSBarry Smith a->inode.ibdiagvalid = PETSC_TRUE; 28992af78befSBarry Smith } 29002af78befSBarry Smith ibdiag = a->inode.ibdiag; 29012af78befSBarry Smith bdiag = a->inode.bdiag; 29025850ef23SBarry Smith t = a->inode.ssor_work; 29032af78befSBarry Smith 29049566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 29059566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 29065850ef23SBarry Smith /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 29075850ef23SBarry Smith if (flag & SOR_ZERO_INITIAL_GUESS) { 29082af78befSBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 29098862d2efSBarry Smith for (i = 0, row = 0; i < m; i++) { 29108862d2efSBarry Smith sz = diag[row] - ii[row]; 29118862d2efSBarry Smith v1 = a->a + ii[row]; 29128862d2efSBarry Smith idx = a->j + ii[row]; 29138862d2efSBarry Smith 29144108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 29158862d2efSBarry Smith switch (sizes[i]) { 29168862d2efSBarry Smith case 1: 29178862d2efSBarry Smith 29188862d2efSBarry Smith sum1 = b[row]; 29198862d2efSBarry Smith for (n = 0; n < sz - 1; n += 2) { 29208862d2efSBarry Smith i1 = idx[0]; 29218862d2efSBarry Smith i2 = idx[1]; 29228862d2efSBarry Smith idx += 2; 29238862d2efSBarry Smith tmp0 = x[i1]; 29248862d2efSBarry Smith tmp1 = x[i2]; 29259371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 29269371c9d4SSatish Balay v1 += 2; 29278862d2efSBarry Smith } 29288862d2efSBarry Smith 29298862d2efSBarry Smith if (n == sz - 1) 
{ 2930f0d39aaaSBarry Smith tmp0 = x[*idx]; 2931f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 29328862d2efSBarry Smith } 29335850ef23SBarry Smith t[row] = sum1; 29348862d2efSBarry Smith x[row++] = sum1 * (*ibdiag++); 29358862d2efSBarry Smith break; 2936f0d39aaaSBarry Smith case 2: 2937f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 2938f0d39aaaSBarry Smith sum1 = b[row]; 2939f0d39aaaSBarry Smith sum2 = b[row + 1]; 2940f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 2941f0d39aaaSBarry Smith i1 = idx[0]; 2942f0d39aaaSBarry Smith i2 = idx[1]; 2943f0d39aaaSBarry Smith idx += 2; 2944f0d39aaaSBarry Smith tmp0 = x[i1]; 2945f0d39aaaSBarry Smith tmp1 = x[i2]; 29469371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 29479371c9d4SSatish Balay v1 += 2; 29489371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 29499371c9d4SSatish Balay v2 += 2; 2950f0d39aaaSBarry Smith } 2951f0d39aaaSBarry Smith 2952f0d39aaaSBarry Smith if (n == sz - 1) { 2953f0d39aaaSBarry Smith tmp0 = x[*idx]; 2954f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 2955f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 2956f0d39aaaSBarry Smith } 29575850ef23SBarry Smith t[row] = sum1; 29585850ef23SBarry Smith t[row + 1] = sum2; 2959f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2960f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2961f0d39aaaSBarry Smith ibdiag += 4; 2962f0d39aaaSBarry Smith break; 2963f0d39aaaSBarry Smith case 3: 2964f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 2965f0d39aaaSBarry Smith v3 = a->a + ii[row + 2]; 2966f0d39aaaSBarry Smith sum1 = b[row]; 2967f0d39aaaSBarry Smith sum2 = b[row + 1]; 2968f0d39aaaSBarry Smith sum3 = b[row + 2]; 2969f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 2970f0d39aaaSBarry Smith i1 = idx[0]; 2971f0d39aaaSBarry Smith i2 = idx[1]; 2972f0d39aaaSBarry Smith idx += 2; 2973f0d39aaaSBarry Smith tmp0 = x[i1]; 2974f0d39aaaSBarry Smith tmp1 = x[i2]; 29759371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 
29769371c9d4SSatish Balay v1 += 2; 29779371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 29789371c9d4SSatish Balay v2 += 2; 29799371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 29809371c9d4SSatish Balay v3 += 2; 2981f0d39aaaSBarry Smith } 2982f0d39aaaSBarry Smith 2983f0d39aaaSBarry Smith if (n == sz - 1) { 2984f0d39aaaSBarry Smith tmp0 = x[*idx]; 2985f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 2986f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 2987f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0; 2988f0d39aaaSBarry Smith } 29895850ef23SBarry Smith t[row] = sum1; 29905850ef23SBarry Smith t[row + 1] = sum2; 29915850ef23SBarry Smith t[row + 2] = sum3; 2992f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 2993f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 2994f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 2995f0d39aaaSBarry Smith ibdiag += 9; 2996f0d39aaaSBarry Smith break; 2997f0d39aaaSBarry Smith case 4: 2998f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 2999f0d39aaaSBarry Smith v3 = a->a + ii[row + 2]; 3000f0d39aaaSBarry Smith v4 = a->a + ii[row + 3]; 3001f0d39aaaSBarry Smith sum1 = b[row]; 3002f0d39aaaSBarry Smith sum2 = b[row + 1]; 3003f0d39aaaSBarry Smith sum3 = b[row + 2]; 3004f0d39aaaSBarry Smith sum4 = b[row + 3]; 3005f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3006f0d39aaaSBarry Smith i1 = idx[0]; 3007f0d39aaaSBarry Smith i2 = idx[1]; 3008f0d39aaaSBarry Smith idx += 2; 3009f0d39aaaSBarry Smith tmp0 = x[i1]; 3010f0d39aaaSBarry Smith tmp1 = x[i2]; 30119371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 30129371c9d4SSatish Balay v1 += 2; 30139371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 30149371c9d4SSatish Balay v2 += 2; 30159371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 30169371c9d4SSatish Balay v3 += 2; 30179371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 30189371c9d4SSatish Balay v4 
+= 2; 3019f0d39aaaSBarry Smith } 3020f0d39aaaSBarry Smith 3021f0d39aaaSBarry Smith if (n == sz - 1) { 3022f0d39aaaSBarry Smith tmp0 = x[*idx]; 3023f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 3024f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 3025f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0; 3026f0d39aaaSBarry Smith sum4 -= v4[0] * tmp0; 3027f0d39aaaSBarry Smith } 30285850ef23SBarry Smith t[row] = sum1; 30295850ef23SBarry Smith t[row + 1] = sum2; 30305850ef23SBarry Smith t[row + 2] = sum3; 30315850ef23SBarry Smith t[row + 3] = sum4; 3032f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3033f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3034f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3035f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3036f0d39aaaSBarry Smith ibdiag += 16; 3037f0d39aaaSBarry Smith break; 3038f0d39aaaSBarry Smith case 5: 3039f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 3040f0d39aaaSBarry Smith v3 = a->a + ii[row + 2]; 3041f0d39aaaSBarry Smith v4 = a->a + ii[row + 3]; 3042f0d39aaaSBarry Smith v5 = a->a + ii[row + 4]; 3043f0d39aaaSBarry Smith sum1 = b[row]; 3044f0d39aaaSBarry Smith sum2 = b[row + 1]; 3045f0d39aaaSBarry Smith sum3 = b[row + 2]; 3046f0d39aaaSBarry Smith sum4 = b[row + 3]; 3047f0d39aaaSBarry Smith sum5 = b[row + 4]; 3048f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3049f0d39aaaSBarry Smith i1 = idx[0]; 3050f0d39aaaSBarry Smith i2 = idx[1]; 3051f0d39aaaSBarry Smith idx += 2; 3052f0d39aaaSBarry Smith tmp0 = x[i1]; 3053f0d39aaaSBarry Smith tmp1 = x[i2]; 30549371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 30559371c9d4SSatish Balay v1 += 2; 30569371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 30579371c9d4SSatish Balay v2 += 2; 30589371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] 
* tmp1; 30599371c9d4SSatish Balay v3 += 2; 30609371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 30619371c9d4SSatish Balay v4 += 2; 30629371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 30639371c9d4SSatish Balay v5 += 2; 3064f0d39aaaSBarry Smith } 3065f0d39aaaSBarry Smith 3066f0d39aaaSBarry Smith if (n == sz - 1) { 3067f0d39aaaSBarry Smith tmp0 = x[*idx]; 3068f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 3069f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 3070f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0; 3071f0d39aaaSBarry Smith sum4 -= v4[0] * tmp0; 3072f0d39aaaSBarry Smith sum5 -= v5[0] * tmp0; 3073f0d39aaaSBarry Smith } 30745850ef23SBarry Smith t[row] = sum1; 30755850ef23SBarry Smith t[row + 1] = sum2; 30765850ef23SBarry Smith t[row + 2] = sum3; 30775850ef23SBarry Smith t[row + 3] = sum4; 30785850ef23SBarry Smith t[row + 4] = sum5; 3079f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3080f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3081f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3082f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3083f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3084f0d39aaaSBarry Smith ibdiag += 25; 3085f0d39aaaSBarry Smith break; 3086d71ae5a4SJacob Faibussowitsch default: 3087d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 30888862d2efSBarry Smith } 30892af78befSBarry Smith } 30902af78befSBarry Smith 30915850ef23SBarry Smith xb = t; 30929566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 30932af78befSBarry Smith } else xb = b; 
30942af78befSBarry Smith if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3095f0d39aaaSBarry Smith ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3096d0f46423SBarry Smith for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3097f0d39aaaSBarry Smith ibdiag -= sizes[i] * sizes[i]; 30988862d2efSBarry Smith sz = ii[row + 1] - diag[row] - 1; 30998862d2efSBarry Smith v1 = a->a + diag[row] + 1; 31008862d2efSBarry Smith idx = a->j + diag[row] + 1; 31012af78befSBarry Smith 31024108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 31038862d2efSBarry Smith switch (sizes[i]) { 31048862d2efSBarry Smith case 1: 31058862d2efSBarry Smith 31068862d2efSBarry Smith sum1 = xb[row]; 31078862d2efSBarry Smith for (n = 0; n < sz - 1; n += 2) { 31088862d2efSBarry Smith i1 = idx[0]; 31098862d2efSBarry Smith i2 = idx[1]; 31108862d2efSBarry Smith idx += 2; 31118862d2efSBarry Smith tmp0 = x[i1]; 31128862d2efSBarry Smith tmp1 = x[i2]; 31139371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 31149371c9d4SSatish Balay v1 += 2; 31158862d2efSBarry Smith } 31168862d2efSBarry Smith 31178862d2efSBarry Smith if (n == sz - 1) { 3118f0d39aaaSBarry Smith tmp0 = x[*idx]; 3119f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 31208862d2efSBarry Smith } 3121f0d39aaaSBarry Smith x[row--] = sum1 * (*ibdiag); 3122f0d39aaaSBarry Smith break; 3123f0d39aaaSBarry Smith 3124f0d39aaaSBarry Smith case 2: 3125f0d39aaaSBarry Smith 3126f0d39aaaSBarry Smith sum1 = xb[row]; 3127f0d39aaaSBarry Smith sum2 = xb[row - 1]; 3128f0d39aaaSBarry Smith /* note that sum1 is associated with the second of the two rows */ 3129f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 3130f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3131f0d39aaaSBarry Smith i1 = idx[0]; 3132f0d39aaaSBarry Smith i2 = idx[1]; 3133f0d39aaaSBarry Smith idx += 2; 3134f0d39aaaSBarry Smith tmp0 = x[i1]; 3135f0d39aaaSBarry Smith tmp1 = x[i2]; 31369371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 
31379371c9d4SSatish Balay v1 += 2; 31389371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 31399371c9d4SSatish Balay v2 += 2; 3140f0d39aaaSBarry Smith } 3141f0d39aaaSBarry Smith 3142f0d39aaaSBarry Smith if (n == sz - 1) { 3143f0d39aaaSBarry Smith tmp0 = x[*idx]; 3144f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 3145f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 3146f0d39aaaSBarry Smith } 3147f0d39aaaSBarry Smith x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3148f0d39aaaSBarry Smith x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3149f0d39aaaSBarry Smith break; 3150f0d39aaaSBarry Smith case 3: 3151f0d39aaaSBarry Smith 3152f0d39aaaSBarry Smith sum1 = xb[row]; 3153f0d39aaaSBarry Smith sum2 = xb[row - 1]; 3154f0d39aaaSBarry Smith sum3 = xb[row - 2]; 3155f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 3156f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3; 3157f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3158f0d39aaaSBarry Smith i1 = idx[0]; 3159f0d39aaaSBarry Smith i2 = idx[1]; 3160f0d39aaaSBarry Smith idx += 2; 3161f0d39aaaSBarry Smith tmp0 = x[i1]; 3162f0d39aaaSBarry Smith tmp1 = x[i2]; 31639371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 31649371c9d4SSatish Balay v1 += 2; 31659371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 31669371c9d4SSatish Balay v2 += 2; 31679371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 31689371c9d4SSatish Balay v3 += 2; 3169f0d39aaaSBarry Smith } 3170f0d39aaaSBarry Smith 3171f0d39aaaSBarry Smith if (n == sz - 1) { 3172f0d39aaaSBarry Smith tmp0 = x[*idx]; 3173f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 3174f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 3175f0d39aaaSBarry Smith sum3 -= *v3 * tmp0; 3176f0d39aaaSBarry Smith } 3177f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3178f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3179f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 
3180f0d39aaaSBarry Smith break; 3181f0d39aaaSBarry Smith case 4: 3182f0d39aaaSBarry Smith 3183f0d39aaaSBarry Smith sum1 = xb[row]; 3184f0d39aaaSBarry Smith sum2 = xb[row - 1]; 3185f0d39aaaSBarry Smith sum3 = xb[row - 2]; 3186f0d39aaaSBarry Smith sum4 = xb[row - 3]; 3187f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 3188f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3; 3189f0d39aaaSBarry Smith v4 = a->a + diag[row - 3] + 4; 3190f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3191f0d39aaaSBarry Smith i1 = idx[0]; 3192f0d39aaaSBarry Smith i2 = idx[1]; 3193f0d39aaaSBarry Smith idx += 2; 3194f0d39aaaSBarry Smith tmp0 = x[i1]; 3195f0d39aaaSBarry Smith tmp1 = x[i2]; 31969371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 31979371c9d4SSatish Balay v1 += 2; 31989371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 31999371c9d4SSatish Balay v2 += 2; 32009371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 32019371c9d4SSatish Balay v3 += 2; 32029371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 32039371c9d4SSatish Balay v4 += 2; 3204f0d39aaaSBarry Smith } 3205f0d39aaaSBarry Smith 3206f0d39aaaSBarry Smith if (n == sz - 1) { 3207f0d39aaaSBarry Smith tmp0 = x[*idx]; 3208f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 3209f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 3210f0d39aaaSBarry Smith sum3 -= *v3 * tmp0; 3211f0d39aaaSBarry Smith sum4 -= *v4 * tmp0; 3212f0d39aaaSBarry Smith } 3213f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3214f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3215f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3216f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3217f0d39aaaSBarry Smith break; 3218f0d39aaaSBarry Smith case 5: 3219f0d39aaaSBarry Smith 3220f0d39aaaSBarry Smith sum1 = xb[row]; 
3221f0d39aaaSBarry Smith sum2 = xb[row - 1]; 3222f0d39aaaSBarry Smith sum3 = xb[row - 2]; 3223f0d39aaaSBarry Smith sum4 = xb[row - 3]; 3224f0d39aaaSBarry Smith sum5 = xb[row - 4]; 3225f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 3226f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3; 3227f0d39aaaSBarry Smith v4 = a->a + diag[row - 3] + 4; 3228f0d39aaaSBarry Smith v5 = a->a + diag[row - 4] + 5; 3229f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3230f0d39aaaSBarry Smith i1 = idx[0]; 3231f0d39aaaSBarry Smith i2 = idx[1]; 3232f0d39aaaSBarry Smith idx += 2; 3233f0d39aaaSBarry Smith tmp0 = x[i1]; 3234f0d39aaaSBarry Smith tmp1 = x[i2]; 32359371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 32369371c9d4SSatish Balay v1 += 2; 32379371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 32389371c9d4SSatish Balay v2 += 2; 32399371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 32409371c9d4SSatish Balay v3 += 2; 32419371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 32429371c9d4SSatish Balay v4 += 2; 32439371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 32449371c9d4SSatish Balay v5 += 2; 3245f0d39aaaSBarry Smith } 3246f0d39aaaSBarry Smith 3247f0d39aaaSBarry Smith if (n == sz - 1) { 3248f0d39aaaSBarry Smith tmp0 = x[*idx]; 3249f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 3250f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 3251f0d39aaaSBarry Smith sum3 -= *v3 * tmp0; 3252f0d39aaaSBarry Smith sum4 -= *v4 * tmp0; 3253f0d39aaaSBarry Smith sum5 -= *v5 * tmp0; 3254f0d39aaaSBarry Smith } 3255f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3256f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3257f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3258f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[1] + sum4 * 
ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3259f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 32608862d2efSBarry Smith break; 3261d71ae5a4SJacob Faibussowitsch default: 3262d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 32638862d2efSBarry Smith } 32642af78befSBarry Smith } 32652af78befSBarry Smith 32669566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 32672af78befSBarry Smith } 32682af78befSBarry Smith its--; 32695850ef23SBarry Smith } 32705850ef23SBarry Smith while (its--) { 32715850ef23SBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 32729371c9d4SSatish Balay for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) { 3273d876e2b0SMark Adams sz = diag[row] - ii[row]; 32745850ef23SBarry Smith v1 = a->a + ii[row]; 32755850ef23SBarry Smith idx = a->j + ii[row]; 32765850ef23SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 32775850ef23SBarry Smith switch (sizes[i]) { 32785850ef23SBarry Smith case 1: 32795850ef23SBarry Smith sum1 = b[row]; 32805850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 32815850ef23SBarry Smith i1 = idx[0]; 32825850ef23SBarry Smith i2 = idx[1]; 32835850ef23SBarry Smith idx += 2; 32845850ef23SBarry Smith tmp0 = x[i1]; 32855850ef23SBarry Smith tmp1 = x[i2]; 32869371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 32879371c9d4SSatish Balay v1 += 2; 32885850ef23SBarry Smith } 32895850ef23SBarry Smith if (n == sz - 1) { 3290d876e2b0SMark Adams tmp0 = x[*idx++]; 3291d876e2b0SMark Adams sum1 -= *v1 * tmp0; 3292d876e2b0SMark Adams v1++; 3293d876e2b0SMark Adams } 3294d876e2b0SMark Adams t[row] = sum1; 3295d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 1; 3296d876e2b0SMark Adams idx = a->j + diag[row] + 1; 3297d876e2b0SMark Adams v1 
+= 1; 3298d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3299d876e2b0SMark Adams i1 = idx[0]; 3300d876e2b0SMark Adams i2 = idx[1]; 3301d876e2b0SMark Adams idx += 2; 3302d876e2b0SMark Adams tmp0 = x[i1]; 3303d876e2b0SMark Adams tmp1 = x[i2]; 33049371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 33059371c9d4SSatish Balay v1 += 2; 3306d876e2b0SMark Adams } 3307d876e2b0SMark Adams if (n == sz - 1) { 3308d876e2b0SMark Adams tmp0 = x[*idx++]; 33095850ef23SBarry Smith sum1 -= *v1 * tmp0; 33105850ef23SBarry Smith } 33115850ef23SBarry Smith /* in MatSOR_SeqAIJ this line would be 33125850ef23SBarry Smith * 33135850ef23SBarry Smith * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 33145850ef23SBarry Smith * 33155850ef23SBarry Smith * but omega == 1, so this becomes 33165850ef23SBarry Smith * 3317d876e2b0SMark Adams * x[row] = sum1*(*ibdiag++); 33185850ef23SBarry Smith * 33195850ef23SBarry Smith */ 3320d876e2b0SMark Adams x[row] = sum1 * (*ibdiag); 33215850ef23SBarry Smith break; 33225850ef23SBarry Smith case 2: 33235850ef23SBarry Smith v2 = a->a + ii[row + 1]; 33245850ef23SBarry Smith sum1 = b[row]; 33255850ef23SBarry Smith sum2 = b[row + 1]; 33265850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 33275850ef23SBarry Smith i1 = idx[0]; 33285850ef23SBarry Smith i2 = idx[1]; 33295850ef23SBarry Smith idx += 2; 33305850ef23SBarry Smith tmp0 = x[i1]; 33315850ef23SBarry Smith tmp1 = x[i2]; 33329371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 33339371c9d4SSatish Balay v1 += 2; 33349371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 33359371c9d4SSatish Balay v2 += 2; 33365850ef23SBarry Smith } 3337d876e2b0SMark Adams if (n == sz - 1) { 3338d876e2b0SMark Adams tmp0 = x[*idx++]; 3339d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 3340d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 33419371c9d4SSatish Balay v1++; 33429371c9d4SSatish Balay v2++; 3343d876e2b0SMark Adams } 3344d876e2b0SMark Adams t[row] = sum1; 3345d876e2b0SMark Adams t[row + 1] = 
sum2; 3346d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 2; 3347d876e2b0SMark Adams idx = a->j + diag[row] + 2; 3348d876e2b0SMark Adams v1 += 2; 3349d876e2b0SMark Adams v2 += 2; 3350d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3351d876e2b0SMark Adams i1 = idx[0]; 3352d876e2b0SMark Adams i2 = idx[1]; 3353d876e2b0SMark Adams idx += 2; 3354d876e2b0SMark Adams tmp0 = x[i1]; 3355d876e2b0SMark Adams tmp1 = x[i2]; 33569371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 33579371c9d4SSatish Balay v1 += 2; 33589371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 33599371c9d4SSatish Balay v2 += 2; 3360d876e2b0SMark Adams } 33615850ef23SBarry Smith if (n == sz - 1) { 33625850ef23SBarry Smith tmp0 = x[*idx]; 33635850ef23SBarry Smith sum1 -= v1[0] * tmp0; 33645850ef23SBarry Smith sum2 -= v2[0] * tmp0; 33655850ef23SBarry Smith } 3366d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3367d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 33685850ef23SBarry Smith break; 33695850ef23SBarry Smith case 3: 33705850ef23SBarry Smith v2 = a->a + ii[row + 1]; 33715850ef23SBarry Smith v3 = a->a + ii[row + 2]; 33725850ef23SBarry Smith sum1 = b[row]; 33735850ef23SBarry Smith sum2 = b[row + 1]; 33745850ef23SBarry Smith sum3 = b[row + 2]; 33755850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 33765850ef23SBarry Smith i1 = idx[0]; 33775850ef23SBarry Smith i2 = idx[1]; 33785850ef23SBarry Smith idx += 2; 33795850ef23SBarry Smith tmp0 = x[i1]; 33805850ef23SBarry Smith tmp1 = x[i2]; 33819371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 33829371c9d4SSatish Balay v1 += 2; 33839371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 33849371c9d4SSatish Balay v2 += 2; 33859371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 33869371c9d4SSatish Balay v3 += 2; 33875850ef23SBarry Smith } 3388d876e2b0SMark Adams if (n == sz - 1) { 3389d876e2b0SMark Adams tmp0 = x[*idx++]; 3390d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 
3391d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 3392d876e2b0SMark Adams sum3 -= v3[0] * tmp0; 33939371c9d4SSatish Balay v1++; 33949371c9d4SSatish Balay v2++; 33959371c9d4SSatish Balay v3++; 3396d876e2b0SMark Adams } 3397d876e2b0SMark Adams t[row] = sum1; 3398d876e2b0SMark Adams t[row + 1] = sum2; 3399d876e2b0SMark Adams t[row + 2] = sum3; 3400d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 3; 3401d876e2b0SMark Adams idx = a->j + diag[row] + 3; 3402d876e2b0SMark Adams v1 += 3; 3403d876e2b0SMark Adams v2 += 3; 3404d876e2b0SMark Adams v3 += 3; 3405d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3406d876e2b0SMark Adams i1 = idx[0]; 3407d876e2b0SMark Adams i2 = idx[1]; 3408d876e2b0SMark Adams idx += 2; 3409d876e2b0SMark Adams tmp0 = x[i1]; 3410d876e2b0SMark Adams tmp1 = x[i2]; 34119371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 34129371c9d4SSatish Balay v1 += 2; 34139371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 34149371c9d4SSatish Balay v2 += 2; 34159371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 34169371c9d4SSatish Balay v3 += 2; 3417d876e2b0SMark Adams } 34185850ef23SBarry Smith if (n == sz - 1) { 34195850ef23SBarry Smith tmp0 = x[*idx]; 34205850ef23SBarry Smith sum1 -= v1[0] * tmp0; 34215850ef23SBarry Smith sum2 -= v2[0] * tmp0; 34225850ef23SBarry Smith sum3 -= v3[0] * tmp0; 34235850ef23SBarry Smith } 3424d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3425d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3426d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 34275850ef23SBarry Smith break; 34285850ef23SBarry Smith case 4: 34295850ef23SBarry Smith v2 = a->a + ii[row + 1]; 34305850ef23SBarry Smith v3 = a->a + ii[row + 2]; 34315850ef23SBarry Smith v4 = a->a + ii[row + 3]; 34325850ef23SBarry Smith sum1 = b[row]; 34335850ef23SBarry Smith sum2 = b[row + 1]; 34345850ef23SBarry Smith sum3 = b[row + 2]; 34355850ef23SBarry 
Smith sum4 = b[row + 3]; 34365850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 34375850ef23SBarry Smith i1 = idx[0]; 34385850ef23SBarry Smith i2 = idx[1]; 34395850ef23SBarry Smith idx += 2; 34405850ef23SBarry Smith tmp0 = x[i1]; 34415850ef23SBarry Smith tmp1 = x[i2]; 34429371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 34439371c9d4SSatish Balay v1 += 2; 34449371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 34459371c9d4SSatish Balay v2 += 2; 34469371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 34479371c9d4SSatish Balay v3 += 2; 34489371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 34499371c9d4SSatish Balay v4 += 2; 34505850ef23SBarry Smith } 3451d876e2b0SMark Adams if (n == sz - 1) { 3452d876e2b0SMark Adams tmp0 = x[*idx++]; 3453d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 3454d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 3455d876e2b0SMark Adams sum3 -= v3[0] * tmp0; 3456d876e2b0SMark Adams sum4 -= v4[0] * tmp0; 34579371c9d4SSatish Balay v1++; 34589371c9d4SSatish Balay v2++; 34599371c9d4SSatish Balay v3++; 34609371c9d4SSatish Balay v4++; 3461d876e2b0SMark Adams } 3462d876e2b0SMark Adams t[row] = sum1; 3463d876e2b0SMark Adams t[row + 1] = sum2; 3464d876e2b0SMark Adams t[row + 2] = sum3; 3465d876e2b0SMark Adams t[row + 3] = sum4; 3466d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 4; 3467d876e2b0SMark Adams idx = a->j + diag[row] + 4; 3468d876e2b0SMark Adams v1 += 4; 3469d876e2b0SMark Adams v2 += 4; 3470d876e2b0SMark Adams v3 += 4; 3471d876e2b0SMark Adams v4 += 4; 3472d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3473d876e2b0SMark Adams i1 = idx[0]; 3474d876e2b0SMark Adams i2 = idx[1]; 3475d876e2b0SMark Adams idx += 2; 3476d876e2b0SMark Adams tmp0 = x[i1]; 3477d876e2b0SMark Adams tmp1 = x[i2]; 34789371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 34799371c9d4SSatish Balay v1 += 2; 34809371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 34819371c9d4SSatish Balay v2 += 2; 34829371c9d4SSatish Balay sum3 -= 
v3[0] * tmp0 + v3[1] * tmp1; 34839371c9d4SSatish Balay v3 += 2; 34849371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 34859371c9d4SSatish Balay v4 += 2; 3486d876e2b0SMark Adams } 34875850ef23SBarry Smith if (n == sz - 1) { 34885850ef23SBarry Smith tmp0 = x[*idx]; 34895850ef23SBarry Smith sum1 -= v1[0] * tmp0; 34905850ef23SBarry Smith sum2 -= v2[0] * tmp0; 34915850ef23SBarry Smith sum3 -= v3[0] * tmp0; 34925850ef23SBarry Smith sum4 -= v4[0] * tmp0; 34935850ef23SBarry Smith } 3494d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3495d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3496d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3497d876e2b0SMark Adams x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 34985850ef23SBarry Smith break; 34995850ef23SBarry Smith case 5: 35005850ef23SBarry Smith v2 = a->a + ii[row + 1]; 35015850ef23SBarry Smith v3 = a->a + ii[row + 2]; 35025850ef23SBarry Smith v4 = a->a + ii[row + 3]; 35035850ef23SBarry Smith v5 = a->a + ii[row + 4]; 35045850ef23SBarry Smith sum1 = b[row]; 35055850ef23SBarry Smith sum2 = b[row + 1]; 35065850ef23SBarry Smith sum3 = b[row + 2]; 35075850ef23SBarry Smith sum4 = b[row + 3]; 35085850ef23SBarry Smith sum5 = b[row + 4]; 35095850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 35105850ef23SBarry Smith i1 = idx[0]; 35115850ef23SBarry Smith i2 = idx[1]; 35125850ef23SBarry Smith idx += 2; 35135850ef23SBarry Smith tmp0 = x[i1]; 35145850ef23SBarry Smith tmp1 = x[i2]; 35159371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 35169371c9d4SSatish Balay v1 += 2; 35179371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 35189371c9d4SSatish Balay v2 += 2; 35199371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 35209371c9d4SSatish Balay v3 += 2; 35219371c9d4SSatish Balay sum4 -= 
v4[0] * tmp0 + v4[1] * tmp1; 35229371c9d4SSatish Balay v4 += 2; 35239371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 35249371c9d4SSatish Balay v5 += 2; 35255850ef23SBarry Smith } 35265850ef23SBarry Smith if (n == sz - 1) { 3527d876e2b0SMark Adams tmp0 = x[*idx++]; 35285850ef23SBarry Smith sum1 -= v1[0] * tmp0; 35295850ef23SBarry Smith sum2 -= v2[0] * tmp0; 35305850ef23SBarry Smith sum3 -= v3[0] * tmp0; 35315850ef23SBarry Smith sum4 -= v4[0] * tmp0; 35325850ef23SBarry Smith sum5 -= v5[0] * tmp0; 35339371c9d4SSatish Balay v1++; 35349371c9d4SSatish Balay v2++; 35359371c9d4SSatish Balay v3++; 35369371c9d4SSatish Balay v4++; 35379371c9d4SSatish Balay v5++; 35385850ef23SBarry Smith } 3539d876e2b0SMark Adams t[row] = sum1; 3540d876e2b0SMark Adams t[row + 1] = sum2; 3541d876e2b0SMark Adams t[row + 2] = sum3; 3542d876e2b0SMark Adams t[row + 3] = sum4; 3543d876e2b0SMark Adams t[row + 4] = sum5; 3544d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 5; 3545d876e2b0SMark Adams idx = a->j + diag[row] + 5; 3546d876e2b0SMark Adams v1 += 5; 3547d876e2b0SMark Adams v2 += 5; 3548d876e2b0SMark Adams v3 += 5; 3549d876e2b0SMark Adams v4 += 5; 3550d876e2b0SMark Adams v5 += 5; 35515850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 35525850ef23SBarry Smith i1 = idx[0]; 35535850ef23SBarry Smith i2 = idx[1]; 35545850ef23SBarry Smith idx += 2; 35555850ef23SBarry Smith tmp0 = x[i1]; 35565850ef23SBarry Smith tmp1 = x[i2]; 35579371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 35589371c9d4SSatish Balay v1 += 2; 35599371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 35609371c9d4SSatish Balay v2 += 2; 35619371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 35629371c9d4SSatish Balay v3 += 2; 35639371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 35649371c9d4SSatish Balay v4 += 2; 35659371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 35669371c9d4SSatish Balay v5 += 2; 35675850ef23SBarry Smith } 35685850ef23SBarry Smith if (n == sz - 1) { 
35695850ef23SBarry Smith tmp0 = x[*idx]; 3570d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 3571d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 3572d876e2b0SMark Adams sum3 -= v3[0] * tmp0; 3573d876e2b0SMark Adams sum4 -= v4[0] * tmp0; 3574d876e2b0SMark Adams sum5 -= v5[0] * tmp0; 35755850ef23SBarry Smith } 3576d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3577d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3578d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3579d876e2b0SMark Adams x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3580d876e2b0SMark Adams x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3581d876e2b0SMark Adams break; 3582d71ae5a4SJacob Faibussowitsch default: 3583d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3584d876e2b0SMark Adams } 3585d876e2b0SMark Adams } 3586d876e2b0SMark Adams xb = t; 35879566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3588d876e2b0SMark Adams } else xb = b; 3589d876e2b0SMark Adams 3590d876e2b0SMark Adams if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3591d876e2b0SMark Adams ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3592d876e2b0SMark Adams for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3593d876e2b0SMark Adams ibdiag -= sizes[i] * sizes[i]; 3594d876e2b0SMark Adams 3595d876e2b0SMark Adams /* set RHS */ 3596d876e2b0SMark Adams if (xb == b) { 3597d876e2b0SMark Adams /* whole (old way) */ 3598d876e2b0SMark Adams sz = ii[row + 1] - ii[row]; 3599d876e2b0SMark Adams idx = a->j + ii[row]; 3600d876e2b0SMark Adams switch 
(sizes[i]) { 3601d71ae5a4SJacob Faibussowitsch case 5: 3602d71ae5a4SJacob Faibussowitsch v5 = a->a + ii[row - 4]; /* fall through */ 3603d71ae5a4SJacob Faibussowitsch case 4: 3604d71ae5a4SJacob Faibussowitsch v4 = a->a + ii[row - 3]; /* fall through */ 3605d71ae5a4SJacob Faibussowitsch case 3: 3606d71ae5a4SJacob Faibussowitsch v3 = a->a + ii[row - 2]; /* fall through */ 3607d71ae5a4SJacob Faibussowitsch case 2: 3608d71ae5a4SJacob Faibussowitsch v2 = a->a + ii[row - 1]; /* fall through */ 3609d71ae5a4SJacob Faibussowitsch case 1: 3610d71ae5a4SJacob Faibussowitsch v1 = a->a + ii[row]; 3611d71ae5a4SJacob Faibussowitsch break; 3612d71ae5a4SJacob Faibussowitsch default: 3613d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3614d876e2b0SMark Adams } 3615d876e2b0SMark Adams } else { 3616d876e2b0SMark Adams /* upper, no diag */ 3617d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 1; 3618d876e2b0SMark Adams idx = a->j + diag[row] + 1; 3619d876e2b0SMark Adams switch (sizes[i]) { 3620d71ae5a4SJacob Faibussowitsch case 5: 3621d71ae5a4SJacob Faibussowitsch v5 = a->a + diag[row - 4] + 5; /* fall through */ 3622d71ae5a4SJacob Faibussowitsch case 4: 3623d71ae5a4SJacob Faibussowitsch v4 = a->a + diag[row - 3] + 4; /* fall through */ 3624d71ae5a4SJacob Faibussowitsch case 3: 3625d71ae5a4SJacob Faibussowitsch v3 = a->a + diag[row - 2] + 3; /* fall through */ 3626d71ae5a4SJacob Faibussowitsch case 2: 3627d71ae5a4SJacob Faibussowitsch v2 = a->a + diag[row - 1] + 2; /* fall through */ 3628d71ae5a4SJacob Faibussowitsch case 1: 3629d71ae5a4SJacob Faibussowitsch v1 = a->a + diag[row] + 1; 3630d876e2b0SMark Adams } 3631d876e2b0SMark Adams } 3632d876e2b0SMark Adams /* set sum */ 3633d876e2b0SMark Adams switch (sizes[i]) { 3634d71ae5a4SJacob Faibussowitsch case 5: 3635d71ae5a4SJacob Faibussowitsch sum5 = xb[row - 4]; /* fall through */ 3636d71ae5a4SJacob Faibussowitsch case 4: 3637d71ae5a4SJacob Faibussowitsch sum4 
= xb[row - 3]; /* fall through */ 3638d71ae5a4SJacob Faibussowitsch case 3: 3639d71ae5a4SJacob Faibussowitsch sum3 = xb[row - 2]; /* fall through */ 3640d71ae5a4SJacob Faibussowitsch case 2: 3641d71ae5a4SJacob Faibussowitsch sum2 = xb[row - 1]; /* fall through */ 3642d876e2b0SMark Adams case 1: 3643d876e2b0SMark Adams /* note that sum1 is associated with the last row */ 3644d876e2b0SMark Adams sum1 = xb[row]; 3645d876e2b0SMark Adams } 3646d876e2b0SMark Adams /* do sums */ 3647d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3648d876e2b0SMark Adams i1 = idx[0]; 3649d876e2b0SMark Adams i2 = idx[1]; 3650d876e2b0SMark Adams idx += 2; 3651d876e2b0SMark Adams tmp0 = x[i1]; 3652d876e2b0SMark Adams tmp1 = x[i2]; 3653d876e2b0SMark Adams switch (sizes[i]) { 3654d71ae5a4SJacob Faibussowitsch case 5: 3655d71ae5a4SJacob Faibussowitsch sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3656d71ae5a4SJacob Faibussowitsch v5 += 2; /* fall through */ 3657d71ae5a4SJacob Faibussowitsch case 4: 3658d71ae5a4SJacob Faibussowitsch sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3659d71ae5a4SJacob Faibussowitsch v4 += 2; /* fall through */ 3660d71ae5a4SJacob Faibussowitsch case 3: 3661d71ae5a4SJacob Faibussowitsch sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3662d71ae5a4SJacob Faibussowitsch v3 += 2; /* fall through */ 3663d71ae5a4SJacob Faibussowitsch case 2: 3664d71ae5a4SJacob Faibussowitsch sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3665d71ae5a4SJacob Faibussowitsch v2 += 2; /* fall through */ 3666d71ae5a4SJacob Faibussowitsch case 1: 3667d71ae5a4SJacob Faibussowitsch sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3668d71ae5a4SJacob Faibussowitsch v1 += 2; 3669d876e2b0SMark Adams } 3670d876e2b0SMark Adams } 3671d876e2b0SMark Adams /* ragged edge */ 3672d876e2b0SMark Adams if (n == sz - 1) { 3673d876e2b0SMark Adams tmp0 = x[*idx]; 3674d876e2b0SMark Adams switch (sizes[i]) { 3675d71ae5a4SJacob Faibussowitsch case 5: 3676d71ae5a4SJacob Faibussowitsch sum5 -= *v5 * tmp0; /* fall through */ 3677d71ae5a4SJacob Faibussowitsch case 4: 
3678d71ae5a4SJacob Faibussowitsch sum4 -= *v4 * tmp0; /* fall through */ 3679d71ae5a4SJacob Faibussowitsch case 3: 3680d71ae5a4SJacob Faibussowitsch sum3 -= *v3 * tmp0; /* fall through */ 3681d71ae5a4SJacob Faibussowitsch case 2: 3682d71ae5a4SJacob Faibussowitsch sum2 -= *v2 * tmp0; /* fall through */ 3683d71ae5a4SJacob Faibussowitsch case 1: 3684d71ae5a4SJacob Faibussowitsch sum1 -= *v1 * tmp0; 3685d876e2b0SMark Adams } 3686d876e2b0SMark Adams } 3687d876e2b0SMark Adams /* update */ 3688d876e2b0SMark Adams if (xb == b) { 3689d876e2b0SMark Adams /* whole (old way) w/ diag */ 3690d876e2b0SMark Adams switch (sizes[i]) { 3691d876e2b0SMark Adams case 5: 36925850ef23SBarry Smith x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 36935850ef23SBarry Smith x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 36945850ef23SBarry Smith x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 36955850ef23SBarry Smith x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 36965850ef23SBarry Smith x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 36975850ef23SBarry Smith break; 3698d876e2b0SMark Adams case 4: 3699d876e2b0SMark Adams x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3700d876e2b0SMark Adams x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3701d876e2b0SMark Adams x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3702d876e2b0SMark Adams x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3703d876e2b0SMark Adams break; 3704d876e2b0SMark Adams case 3: 3705d876e2b0SMark Adams x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 
3706d876e2b0SMark Adams x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3707d876e2b0SMark Adams x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3708d876e2b0SMark Adams break; 3709d876e2b0SMark Adams case 2: 3710d876e2b0SMark Adams x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3711d876e2b0SMark Adams x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3712d876e2b0SMark Adams break; 3713d71ae5a4SJacob Faibussowitsch case 1: 3714d71ae5a4SJacob Faibussowitsch x[row--] += sum1 * (*ibdiag); 3715d71ae5a4SJacob Faibussowitsch break; 3716d876e2b0SMark Adams } 3717d876e2b0SMark Adams } else { 3718d876e2b0SMark Adams /* no diag so set = */ 3719d876e2b0SMark Adams switch (sizes[i]) { 3720d876e2b0SMark Adams case 5: 3721d876e2b0SMark Adams x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3722d876e2b0SMark Adams x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3723d876e2b0SMark Adams x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3724d876e2b0SMark Adams x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3725d876e2b0SMark Adams x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3726d876e2b0SMark Adams break; 3727d876e2b0SMark Adams case 4: 3728d876e2b0SMark Adams x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3729d876e2b0SMark Adams x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3730d876e2b0SMark Adams x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3731d876e2b0SMark Adams x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3732d876e2b0SMark Adams break; 3733d876e2b0SMark Adams case 3: 
3734d876e2b0SMark Adams x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3735d876e2b0SMark Adams x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3736d876e2b0SMark Adams x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3737d876e2b0SMark Adams break; 3738d876e2b0SMark Adams case 2: 3739d876e2b0SMark Adams x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3740d876e2b0SMark Adams x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3741d876e2b0SMark Adams break; 3742d71ae5a4SJacob Faibussowitsch case 1: 3743d71ae5a4SJacob Faibussowitsch x[row--] = sum1 * (*ibdiag); 3744d71ae5a4SJacob Faibussowitsch break; 37455850ef23SBarry Smith } 37465850ef23SBarry Smith } 3747d876e2b0SMark Adams } 3748d876e2b0SMark Adams if (xb == b) { 37499566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 3750d876e2b0SMark Adams } else { 37519566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3752d876e2b0SMark Adams } 37535850ef23SBarry Smith } 37542af78befSBarry Smith } 375589c6957cSBarry Smith if (flag & SOR_EISENSTAT) { 375689c6957cSBarry Smith /* 375789c6957cSBarry Smith Apply (U + D)^-1 where D is now the block diagonal 375889c6957cSBarry Smith */ 375989c6957cSBarry Smith ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 376089c6957cSBarry Smith for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 376189c6957cSBarry Smith ibdiag -= sizes[i] * sizes[i]; 376289c6957cSBarry Smith sz = ii[row + 1] - diag[row] - 1; 376389c6957cSBarry Smith v1 = a->a + diag[row] + 1; 376489c6957cSBarry Smith idx = a->j + diag[row] + 1; 37654108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 376689c6957cSBarry Smith switch (sizes[i]) { 376789c6957cSBarry Smith case 1: 376889c6957cSBarry Smith 376989c6957cSBarry Smith sum1 = b[row]; 377089c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 377189c6957cSBarry Smith i1 = idx[0]; 377289c6957cSBarry Smith i2 = 
idx[1]; 377389c6957cSBarry Smith idx += 2; 377489c6957cSBarry Smith tmp0 = x[i1]; 377589c6957cSBarry Smith tmp1 = x[i2]; 37769371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 37779371c9d4SSatish Balay v1 += 2; 377889c6957cSBarry Smith } 377989c6957cSBarry Smith 378089c6957cSBarry Smith if (n == sz - 1) { 378189c6957cSBarry Smith tmp0 = x[*idx]; 378289c6957cSBarry Smith sum1 -= *v1 * tmp0; 378389c6957cSBarry Smith } 37849371c9d4SSatish Balay x[row] = sum1 * (*ibdiag); 37859371c9d4SSatish Balay row--; 378689c6957cSBarry Smith break; 378789c6957cSBarry Smith 378889c6957cSBarry Smith case 2: 378989c6957cSBarry Smith 379089c6957cSBarry Smith sum1 = b[row]; 379189c6957cSBarry Smith sum2 = b[row - 1]; 379289c6957cSBarry Smith /* note that sum1 is associated with the second of the two rows */ 379389c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 379489c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 379589c6957cSBarry Smith i1 = idx[0]; 379689c6957cSBarry Smith i2 = idx[1]; 379789c6957cSBarry Smith idx += 2; 379889c6957cSBarry Smith tmp0 = x[i1]; 379989c6957cSBarry Smith tmp1 = x[i2]; 38009371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 38019371c9d4SSatish Balay v1 += 2; 38029371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 38039371c9d4SSatish Balay v2 += 2; 380489c6957cSBarry Smith } 380589c6957cSBarry Smith 380689c6957cSBarry Smith if (n == sz - 1) { 380789c6957cSBarry Smith tmp0 = x[*idx]; 380889c6957cSBarry Smith sum1 -= *v1 * tmp0; 380989c6957cSBarry Smith sum2 -= *v2 * tmp0; 381089c6957cSBarry Smith } 3811938d4eb3SBarry Smith x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3812938d4eb3SBarry Smith x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3813938d4eb3SBarry Smith row -= 2; 381489c6957cSBarry Smith break; 381589c6957cSBarry Smith case 3: 381689c6957cSBarry Smith 381789c6957cSBarry Smith sum1 = b[row]; 381889c6957cSBarry Smith sum2 = b[row - 1]; 381989c6957cSBarry Smith sum3 = b[row - 2]; 382089c6957cSBarry Smith v2 = a->a + 
diag[row - 1] + 2; 382189c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3; 382289c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 382389c6957cSBarry Smith i1 = idx[0]; 382489c6957cSBarry Smith i2 = idx[1]; 382589c6957cSBarry Smith idx += 2; 382689c6957cSBarry Smith tmp0 = x[i1]; 382789c6957cSBarry Smith tmp1 = x[i2]; 38289371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 38299371c9d4SSatish Balay v1 += 2; 38309371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 38319371c9d4SSatish Balay v2 += 2; 38329371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 38339371c9d4SSatish Balay v3 += 2; 383489c6957cSBarry Smith } 383589c6957cSBarry Smith 383689c6957cSBarry Smith if (n == sz - 1) { 383789c6957cSBarry Smith tmp0 = x[*idx]; 383889c6957cSBarry Smith sum1 -= *v1 * tmp0; 383989c6957cSBarry Smith sum2 -= *v2 * tmp0; 384089c6957cSBarry Smith sum3 -= *v3 * tmp0; 384189c6957cSBarry Smith } 3842938d4eb3SBarry Smith x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3843938d4eb3SBarry Smith x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3844938d4eb3SBarry Smith x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3845938d4eb3SBarry Smith row -= 3; 384689c6957cSBarry Smith break; 384789c6957cSBarry Smith case 4: 384889c6957cSBarry Smith 384989c6957cSBarry Smith sum1 = b[row]; 385089c6957cSBarry Smith sum2 = b[row - 1]; 385189c6957cSBarry Smith sum3 = b[row - 2]; 385289c6957cSBarry Smith sum4 = b[row - 3]; 385389c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 385489c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3; 385589c6957cSBarry Smith v4 = a->a + diag[row - 3] + 4; 385689c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 385789c6957cSBarry Smith i1 = idx[0]; 385889c6957cSBarry Smith i2 = idx[1]; 385989c6957cSBarry Smith idx += 2; 386089c6957cSBarry Smith tmp0 = x[i1]; 386189c6957cSBarry Smith tmp1 = x[i2]; 38629371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 38639371c9d4SSatish Balay v1 
+= 2; 38649371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 38659371c9d4SSatish Balay v2 += 2; 38669371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 38679371c9d4SSatish Balay v3 += 2; 38689371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 38699371c9d4SSatish Balay v4 += 2; 387089c6957cSBarry Smith } 387189c6957cSBarry Smith 387289c6957cSBarry Smith if (n == sz - 1) { 387389c6957cSBarry Smith tmp0 = x[*idx]; 387489c6957cSBarry Smith sum1 -= *v1 * tmp0; 387589c6957cSBarry Smith sum2 -= *v2 * tmp0; 387689c6957cSBarry Smith sum3 -= *v3 * tmp0; 387789c6957cSBarry Smith sum4 -= *v4 * tmp0; 387889c6957cSBarry Smith } 3879938d4eb3SBarry Smith x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3880938d4eb3SBarry Smith x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3881938d4eb3SBarry Smith x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3882938d4eb3SBarry Smith x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3883938d4eb3SBarry Smith row -= 4; 388489c6957cSBarry Smith break; 388589c6957cSBarry Smith case 5: 388689c6957cSBarry Smith 388789c6957cSBarry Smith sum1 = b[row]; 388889c6957cSBarry Smith sum2 = b[row - 1]; 388989c6957cSBarry Smith sum3 = b[row - 2]; 389089c6957cSBarry Smith sum4 = b[row - 3]; 389189c6957cSBarry Smith sum5 = b[row - 4]; 389289c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 389389c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3; 389489c6957cSBarry Smith v4 = a->a + diag[row - 3] + 4; 389589c6957cSBarry Smith v5 = a->a + diag[row - 4] + 5; 389689c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 389789c6957cSBarry Smith i1 = idx[0]; 389889c6957cSBarry Smith i2 = idx[1]; 389989c6957cSBarry Smith idx += 2; 390089c6957cSBarry Smith tmp0 = x[i1]; 390189c6957cSBarry Smith tmp1 = x[i2]; 39029371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 39039371c9d4SSatish Balay 
v1 += 2; 39049371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 39059371c9d4SSatish Balay v2 += 2; 39069371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 39079371c9d4SSatish Balay v3 += 2; 39089371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 39099371c9d4SSatish Balay v4 += 2; 39109371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 39119371c9d4SSatish Balay v5 += 2; 391289c6957cSBarry Smith } 391389c6957cSBarry Smith 391489c6957cSBarry Smith if (n == sz - 1) { 391589c6957cSBarry Smith tmp0 = x[*idx]; 391689c6957cSBarry Smith sum1 -= *v1 * tmp0; 391789c6957cSBarry Smith sum2 -= *v2 * tmp0; 391889c6957cSBarry Smith sum3 -= *v3 * tmp0; 391989c6957cSBarry Smith sum4 -= *v4 * tmp0; 392089c6957cSBarry Smith sum5 -= *v5 * tmp0; 392189c6957cSBarry Smith } 3922938d4eb3SBarry Smith x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3923938d4eb3SBarry Smith x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3924938d4eb3SBarry Smith x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3925938d4eb3SBarry Smith x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3926938d4eb3SBarry Smith x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3927938d4eb3SBarry Smith row -= 5; 392889c6957cSBarry Smith break; 3929d71ae5a4SJacob Faibussowitsch default: 3930d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 393189c6957cSBarry Smith } 393289c6957cSBarry Smith } 39339566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 393489c6957cSBarry Smith 393589c6957cSBarry Smith /* 393689c6957cSBarry Smith t = b - D x where D is the block diagonal 393789c6957cSBarry Smith */ 393889c6957cSBarry Smith cnt 
= 0; 393989c6957cSBarry Smith for (i = 0, row = 0; i < m; i++) { 394089c6957cSBarry Smith switch (sizes[i]) { 394189c6957cSBarry Smith case 1: 39429371c9d4SSatish Balay t[row] = b[row] - bdiag[cnt++] * x[row]; 39439371c9d4SSatish Balay row++; 394489c6957cSBarry Smith break; 394589c6957cSBarry Smith case 2: 39469371c9d4SSatish Balay x1 = x[row]; 39479371c9d4SSatish Balay x2 = x[row + 1]; 394889c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 394989c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 395089c6957cSBarry Smith t[row] = b[row] - tmp1; 39519371c9d4SSatish Balay t[row + 1] = b[row + 1] - tmp2; 39529371c9d4SSatish Balay row += 2; 395389c6957cSBarry Smith cnt += 4; 395489c6957cSBarry Smith break; 395589c6957cSBarry Smith case 3: 39569371c9d4SSatish Balay x1 = x[row]; 39579371c9d4SSatish Balay x2 = x[row + 1]; 39589371c9d4SSatish Balay x3 = x[row + 2]; 395989c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 396089c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 396189c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 396289c6957cSBarry Smith t[row] = b[row] - tmp1; 396389c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2; 39649371c9d4SSatish Balay t[row + 2] = b[row + 2] - tmp3; 39659371c9d4SSatish Balay row += 3; 396689c6957cSBarry Smith cnt += 9; 396789c6957cSBarry Smith break; 396889c6957cSBarry Smith case 4: 39699371c9d4SSatish Balay x1 = x[row]; 39709371c9d4SSatish Balay x2 = x[row + 1]; 39719371c9d4SSatish Balay x3 = x[row + 2]; 39729371c9d4SSatish Balay x4 = x[row + 3]; 397389c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 397489c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 397589c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * 
bdiag[cnt + 14]; 397689c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 397789c6957cSBarry Smith t[row] = b[row] - tmp1; 397889c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2; 397989c6957cSBarry Smith t[row + 2] = b[row + 2] - tmp3; 39809371c9d4SSatish Balay t[row + 3] = b[row + 3] - tmp4; 39819371c9d4SSatish Balay row += 4; 398289c6957cSBarry Smith cnt += 16; 398389c6957cSBarry Smith break; 398489c6957cSBarry Smith case 5: 39859371c9d4SSatish Balay x1 = x[row]; 39869371c9d4SSatish Balay x2 = x[row + 1]; 39879371c9d4SSatish Balay x3 = x[row + 2]; 39889371c9d4SSatish Balay x4 = x[row + 3]; 39899371c9d4SSatish Balay x5 = x[row + 4]; 399089c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 399189c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 399289c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 399389c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 399489c6957cSBarry Smith tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 399589c6957cSBarry Smith t[row] = b[row] - tmp1; 399689c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2; 399789c6957cSBarry Smith t[row + 2] = b[row + 2] - tmp3; 399889c6957cSBarry Smith t[row + 3] = b[row + 3] - tmp4; 39999371c9d4SSatish Balay t[row + 4] = b[row + 4] - tmp5; 40009371c9d4SSatish Balay row += 5; 400189c6957cSBarry Smith cnt += 25; 400289c6957cSBarry Smith break; 4003d71ae5a4SJacob Faibussowitsch default: 4004d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 400589c6957cSBarry 
Smith } 400689c6957cSBarry Smith } 40079566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(m)); 400889c6957cSBarry Smith 400989c6957cSBarry Smith /* 401089c6957cSBarry Smith Apply (L + D)^-1 where D is the block diagonal 401189c6957cSBarry Smith */ 401289c6957cSBarry Smith for (i = 0, row = 0; i < m; i++) { 401389c6957cSBarry Smith sz = diag[row] - ii[row]; 401489c6957cSBarry Smith v1 = a->a + ii[row]; 401589c6957cSBarry Smith idx = a->j + ii[row]; 40164108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 401789c6957cSBarry Smith switch (sizes[i]) { 401889c6957cSBarry Smith case 1: 401989c6957cSBarry Smith 402089c6957cSBarry Smith sum1 = t[row]; 402189c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 402289c6957cSBarry Smith i1 = idx[0]; 402389c6957cSBarry Smith i2 = idx[1]; 402489c6957cSBarry Smith idx += 2; 402589c6957cSBarry Smith tmp0 = t[i1]; 402689c6957cSBarry Smith tmp1 = t[i2]; 40279371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 40289371c9d4SSatish Balay v1 += 2; 402989c6957cSBarry Smith } 403089c6957cSBarry Smith 403189c6957cSBarry Smith if (n == sz - 1) { 403289c6957cSBarry Smith tmp0 = t[*idx]; 403389c6957cSBarry Smith sum1 -= *v1 * tmp0; 403489c6957cSBarry Smith } 40359371c9d4SSatish Balay x[row] += t[row] = sum1 * (*ibdiag++); 40369371c9d4SSatish Balay row++; 403789c6957cSBarry Smith break; 403889c6957cSBarry Smith case 2: 403989c6957cSBarry Smith v2 = a->a + ii[row + 1]; 404089c6957cSBarry Smith sum1 = t[row]; 404189c6957cSBarry Smith sum2 = t[row + 1]; 404289c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 404389c6957cSBarry Smith i1 = idx[0]; 404489c6957cSBarry Smith i2 = idx[1]; 404589c6957cSBarry Smith idx += 2; 404689c6957cSBarry Smith tmp0 = t[i1]; 404789c6957cSBarry Smith tmp1 = t[i2]; 40489371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 40499371c9d4SSatish Balay v1 += 2; 40509371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 40519371c9d4SSatish Balay v2 += 2; 
405289c6957cSBarry Smith } 405389c6957cSBarry Smith 405489c6957cSBarry Smith if (n == sz - 1) { 405589c6957cSBarry Smith tmp0 = t[*idx]; 405689c6957cSBarry Smith sum1 -= v1[0] * tmp0; 405789c6957cSBarry Smith sum2 -= v2[0] * tmp0; 405889c6957cSBarry Smith } 405989c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 406089c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 40619371c9d4SSatish Balay ibdiag += 4; 40629371c9d4SSatish Balay row += 2; 406389c6957cSBarry Smith break; 406489c6957cSBarry Smith case 3: 406589c6957cSBarry Smith v2 = a->a + ii[row + 1]; 406689c6957cSBarry Smith v3 = a->a + ii[row + 2]; 406789c6957cSBarry Smith sum1 = t[row]; 406889c6957cSBarry Smith sum2 = t[row + 1]; 406989c6957cSBarry Smith sum3 = t[row + 2]; 407089c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 407189c6957cSBarry Smith i1 = idx[0]; 407289c6957cSBarry Smith i2 = idx[1]; 407389c6957cSBarry Smith idx += 2; 407489c6957cSBarry Smith tmp0 = t[i1]; 407589c6957cSBarry Smith tmp1 = t[i2]; 40769371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 40779371c9d4SSatish Balay v1 += 2; 40789371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 40799371c9d4SSatish Balay v2 += 2; 40809371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 40819371c9d4SSatish Balay v3 += 2; 408289c6957cSBarry Smith } 408389c6957cSBarry Smith 408489c6957cSBarry Smith if (n == sz - 1) { 408589c6957cSBarry Smith tmp0 = t[*idx]; 408689c6957cSBarry Smith sum1 -= v1[0] * tmp0; 408789c6957cSBarry Smith sum2 -= v2[0] * tmp0; 408889c6957cSBarry Smith sum3 -= v3[0] * tmp0; 408989c6957cSBarry Smith } 409089c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 409189c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 409289c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 40939371c9d4SSatish Balay ibdiag += 9; 
40949371c9d4SSatish Balay row += 3; 409589c6957cSBarry Smith break; 409689c6957cSBarry Smith case 4: 409789c6957cSBarry Smith v2 = a->a + ii[row + 1]; 409889c6957cSBarry Smith v3 = a->a + ii[row + 2]; 409989c6957cSBarry Smith v4 = a->a + ii[row + 3]; 410089c6957cSBarry Smith sum1 = t[row]; 410189c6957cSBarry Smith sum2 = t[row + 1]; 410289c6957cSBarry Smith sum3 = t[row + 2]; 410389c6957cSBarry Smith sum4 = t[row + 3]; 410489c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 410589c6957cSBarry Smith i1 = idx[0]; 410689c6957cSBarry Smith i2 = idx[1]; 410789c6957cSBarry Smith idx += 2; 410889c6957cSBarry Smith tmp0 = t[i1]; 410989c6957cSBarry Smith tmp1 = t[i2]; 41109371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 41119371c9d4SSatish Balay v1 += 2; 41129371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 41139371c9d4SSatish Balay v2 += 2; 41149371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 41159371c9d4SSatish Balay v3 += 2; 41169371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 41179371c9d4SSatish Balay v4 += 2; 411889c6957cSBarry Smith } 411989c6957cSBarry Smith 412089c6957cSBarry Smith if (n == sz - 1) { 412189c6957cSBarry Smith tmp0 = t[*idx]; 412289c6957cSBarry Smith sum1 -= v1[0] * tmp0; 412389c6957cSBarry Smith sum2 -= v2[0] * tmp0; 412489c6957cSBarry Smith sum3 -= v3[0] * tmp0; 412589c6957cSBarry Smith sum4 -= v4[0] * tmp0; 412689c6957cSBarry Smith } 412789c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 412889c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 412989c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 413089c6957cSBarry Smith x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 41319371c9d4SSatish Balay ibdiag += 16; 41329371c9d4SSatish Balay row += 4; 
413389c6957cSBarry Smith break; 413489c6957cSBarry Smith case 5: 413589c6957cSBarry Smith v2 = a->a + ii[row + 1]; 413689c6957cSBarry Smith v3 = a->a + ii[row + 2]; 413789c6957cSBarry Smith v4 = a->a + ii[row + 3]; 413889c6957cSBarry Smith v5 = a->a + ii[row + 4]; 413989c6957cSBarry Smith sum1 = t[row]; 414089c6957cSBarry Smith sum2 = t[row + 1]; 414189c6957cSBarry Smith sum3 = t[row + 2]; 414289c6957cSBarry Smith sum4 = t[row + 3]; 414389c6957cSBarry Smith sum5 = t[row + 4]; 414489c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 414589c6957cSBarry Smith i1 = idx[0]; 414689c6957cSBarry Smith i2 = idx[1]; 414789c6957cSBarry Smith idx += 2; 414889c6957cSBarry Smith tmp0 = t[i1]; 414989c6957cSBarry Smith tmp1 = t[i2]; 41509371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 41519371c9d4SSatish Balay v1 += 2; 41529371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 41539371c9d4SSatish Balay v2 += 2; 41549371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 41559371c9d4SSatish Balay v3 += 2; 41569371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 41579371c9d4SSatish Balay v4 += 2; 41589371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 41599371c9d4SSatish Balay v5 += 2; 416089c6957cSBarry Smith } 416189c6957cSBarry Smith 416289c6957cSBarry Smith if (n == sz - 1) { 416389c6957cSBarry Smith tmp0 = t[*idx]; 416489c6957cSBarry Smith sum1 -= v1[0] * tmp0; 416589c6957cSBarry Smith sum2 -= v2[0] * tmp0; 416689c6957cSBarry Smith sum3 -= v3[0] * tmp0; 416789c6957cSBarry Smith sum4 -= v4[0] * tmp0; 416889c6957cSBarry Smith sum5 -= v5[0] * tmp0; 416989c6957cSBarry Smith } 417089c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 417189c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 417289c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + 
sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 417389c6957cSBarry Smith x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 417489c6957cSBarry Smith x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 41759371c9d4SSatish Balay ibdiag += 25; 41769371c9d4SSatish Balay row += 5; 417789c6957cSBarry Smith break; 4178d71ae5a4SJacob Faibussowitsch default: 4179d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 418089c6957cSBarry Smith } 418189c6957cSBarry Smith } 41829566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 41835850ef23SBarry Smith } 41849566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 41859566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 41863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41872af78befSBarry Smith } 41882af78befSBarry Smith 4189ff6a9541SJacob Faibussowitsch static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 4190d71ae5a4SJacob Faibussowitsch { 419189c6957cSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 419289c6957cSBarry Smith PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 419389c6957cSBarry Smith const MatScalar *bdiag = a->inode.bdiag; 419489c6957cSBarry Smith const PetscScalar *b; 419589c6957cSBarry Smith PetscInt m = a->inode.node_count, cnt = 0, i, row; 419689c6957cSBarry Smith const PetscInt *sizes = a->inode.size; 41972af78befSBarry Smith 419889c6957cSBarry Smith PetscFunctionBegin; 419908401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 42009566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 42019566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 420289c6957cSBarry Smith cnt = 0; 420389c6957cSBarry Smith for (i = 
0, row = 0; i < m; i++) { 420489c6957cSBarry Smith switch (sizes[i]) { 420589c6957cSBarry Smith case 1: 42069371c9d4SSatish Balay x[row] = b[row] * bdiag[cnt++]; 42079371c9d4SSatish Balay row++; 420889c6957cSBarry Smith break; 420989c6957cSBarry Smith case 2: 42109371c9d4SSatish Balay x1 = b[row]; 42119371c9d4SSatish Balay x2 = b[row + 1]; 421289c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 421389c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 421489c6957cSBarry Smith x[row++] = tmp1; 421589c6957cSBarry Smith x[row++] = tmp2; 421689c6957cSBarry Smith cnt += 4; 421789c6957cSBarry Smith break; 421889c6957cSBarry Smith case 3: 42199371c9d4SSatish Balay x1 = b[row]; 42209371c9d4SSatish Balay x2 = b[row + 1]; 42219371c9d4SSatish Balay x3 = b[row + 2]; 422289c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 422389c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 422489c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 422589c6957cSBarry Smith x[row++] = tmp1; 422689c6957cSBarry Smith x[row++] = tmp2; 422789c6957cSBarry Smith x[row++] = tmp3; 422889c6957cSBarry Smith cnt += 9; 422989c6957cSBarry Smith break; 423089c6957cSBarry Smith case 4: 42319371c9d4SSatish Balay x1 = b[row]; 42329371c9d4SSatish Balay x2 = b[row + 1]; 42339371c9d4SSatish Balay x3 = b[row + 2]; 42349371c9d4SSatish Balay x4 = b[row + 3]; 423589c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 423689c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 423789c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 423889c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 423989c6957cSBarry Smith x[row++] = tmp1; 
424089c6957cSBarry Smith x[row++] = tmp2; 424189c6957cSBarry Smith x[row++] = tmp3; 424289c6957cSBarry Smith x[row++] = tmp4; 424389c6957cSBarry Smith cnt += 16; 424489c6957cSBarry Smith break; 424589c6957cSBarry Smith case 5: 42469371c9d4SSatish Balay x1 = b[row]; 42479371c9d4SSatish Balay x2 = b[row + 1]; 42489371c9d4SSatish Balay x3 = b[row + 2]; 42499371c9d4SSatish Balay x4 = b[row + 3]; 42509371c9d4SSatish Balay x5 = b[row + 4]; 425189c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 425289c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 425389c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 425489c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 425589c6957cSBarry Smith tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 425689c6957cSBarry Smith x[row++] = tmp1; 425789c6957cSBarry Smith x[row++] = tmp2; 425889c6957cSBarry Smith x[row++] = tmp3; 425989c6957cSBarry Smith x[row++] = tmp4; 426089c6957cSBarry Smith x[row++] = tmp5; 426189c6957cSBarry Smith cnt += 25; 426289c6957cSBarry Smith break; 4263d71ae5a4SJacob Faibussowitsch default: 4264d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 426589c6957cSBarry Smith } 426689c6957cSBarry Smith } 42679566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * cnt)); 42689566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 42699566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 42703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 427189c6957cSBarry Smith } 427289c6957cSBarry Smith 
4273d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4274d71ae5a4SJacob Faibussowitsch { 4275b215bc84SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4276b215bc84SStefano Zampini 4277b215bc84SStefano Zampini PetscFunctionBegin; 4278b215bc84SStefano Zampini a->inode.node_count = 0; 4279b215bc84SStefano Zampini a->inode.use = PETSC_FALSE; 4280b215bc84SStefano Zampini a->inode.checked = PETSC_FALSE; 4281b215bc84SStefano Zampini a->inode.mat_nonzerostate = -1; 4282b215bc84SStefano Zampini A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4283b215bc84SStefano Zampini A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4284b215bc84SStefano Zampini A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4285b215bc84SStefano Zampini A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4286b215bc84SStefano Zampini A->ops->coloringpatch = NULL; 4287b215bc84SStefano Zampini A->ops->multdiagonalblock = NULL; 4288ad540459SPierre Jolivet if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 42893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4290b215bc84SStefano Zampini } 4291b215bc84SStefano Zampini 42924c1414c8SBarry Smith /* 42934c1414c8SBarry Smith samestructure indicates that the matrix has not changed its nonzero structure so we 42944c1414c8SBarry Smith do not need to recompute the inodes 42954c1414c8SBarry Smith */ 4296d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode(Mat A) 4297d71ae5a4SJacob Faibussowitsch { 42984c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 42998758e1faSBarry Smith PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4300ace3abfcSBarry Smith PetscBool flag; 43018758e1faSBarry Smith const PetscInt *idx, *idy, *ii; 43024c1414c8SBarry Smith 43034c1414c8SBarry Smith PetscFunctionBegin; 4304b215bc84SStefano Zampini if (!a->inode.use) { 43059566063dSJacob Faibussowitsch PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 43069566063dSJacob Faibussowitsch PetscCall(PetscFree(a->inode.size)); 
43073ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4308b215bc84SStefano Zampini } 43093ba16761SJacob Faibussowitsch if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 43104c1414c8SBarry Smith 4311d0f46423SBarry Smith m = A->rmap->n; 43129566063dSJacob Faibussowitsch if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size)); 4313b215bc84SStefano Zampini ns = a->inode.size; 43144c1414c8SBarry Smith 43154c1414c8SBarry Smith i = 0; 43164c1414c8SBarry Smith node_count = 0; 43174c1414c8SBarry Smith idx = a->j; 43184c1414c8SBarry Smith ii = a->i; 43196f2c871aSStefano Zampini if (idx) { 43204c1414c8SBarry Smith while (i < m) { /* For each row */ 43214c1414c8SBarry Smith nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 43224c1414c8SBarry Smith /* Limits the number of elements in a node to 'a->inode.limit' */ 43234c1414c8SBarry Smith for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 43244c1414c8SBarry Smith nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 43254c1414c8SBarry Smith if (nzy != nzx) break; 43264c1414c8SBarry Smith idy += nzx; /* Same nonzero pattern */ 43279566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 43284c1414c8SBarry Smith if (!flag) break; 43294c1414c8SBarry Smith } 43304c1414c8SBarry Smith ns[node_count++] = blk_size; 43314c1414c8SBarry Smith idx += blk_size * nzx; 43324c1414c8SBarry Smith i = j; 43334c1414c8SBarry Smith } 43346f2c871aSStefano Zampini } 43354c1414c8SBarry Smith /* If not enough inodes found,, do not use inode version of the routines */ 43366f2c871aSStefano Zampini if (!m || !idx || node_count > .8 * m) { 43379566063dSJacob Faibussowitsch PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 43389566063dSJacob Faibussowitsch PetscCall(PetscFree(a->inode.size)); 43399566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. 
Not using Inode routines\n", node_count, m)); 43404c1414c8SBarry Smith } else { 4341d5f3da31SBarry Smith if (!A->factortype) { 4342375a6242SBarry Smith A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4343375a6242SBarry Smith if (A->rmap->n == A->cmap->n) { 43444108e4d5SBarry Smith A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 43454108e4d5SBarry Smith A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 43464108e4d5SBarry Smith A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 43474108e4d5SBarry Smith A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 43484108e4d5SBarry Smith A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4349375a6242SBarry Smith } 4350d3ac4fa3SBarry Smith } else { 4351d3ac4fa3SBarry Smith A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4352d3ac4fa3SBarry Smith } 43534c1414c8SBarry Smith a->inode.node_count = node_count; 43549566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". 
Using Inode routines\n", node_count, m, a->inode.limit)); 43554c1414c8SBarry Smith } 4356be6adb11SBarry Smith a->inode.checked = PETSC_TRUE; 4357a02bda8eSBarry Smith a->inode.mat_nonzerostate = A->nonzerostate; 43583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 43594c1414c8SBarry Smith } 43604c1414c8SBarry Smith 4361d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 4362d71ae5a4SJacob Faibussowitsch { 4363150f0143SBarry Smith Mat B = *C; 4364150f0143SBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4365150f0143SBarry Smith PetscInt m = A->rmap->n; 4366150f0143SBarry Smith 4367150f0143SBarry Smith PetscFunctionBegin; 4368150f0143SBarry Smith c->inode.use = a->inode.use; 4369150f0143SBarry Smith c->inode.limit = a->inode.limit; 4370150f0143SBarry Smith c->inode.max_limit = a->inode.max_limit; 4371ec710b6aSStefano Zampini c->inode.checked = PETSC_FALSE; 4372ec710b6aSStefano Zampini c->inode.size = NULL; 4373ec710b6aSStefano Zampini c->inode.node_count = 0; 4374ec710b6aSStefano Zampini c->inode.ibdiagvalid = PETSC_FALSE; 4375ec710b6aSStefano Zampini c->inode.ibdiag = NULL; 4376ec710b6aSStefano Zampini c->inode.bdiag = NULL; 4377ec710b6aSStefano Zampini c->inode.mat_nonzerostate = -1; 4378b215bc84SStefano Zampini if (a->inode.use) { 4379ec710b6aSStefano Zampini if (a->inode.checked && a->inode.size) { 43809566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &c->inode.size)); 43819566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1)); 4382ec710b6aSStefano Zampini 4383ec710b6aSStefano Zampini c->inode.checked = PETSC_TRUE; 4384ec710b6aSStefano Zampini c->inode.node_count = a->inode.node_count; 4385ec710b6aSStefano Zampini c->inode.mat_nonzerostate = (*C)->nonzerostate; 4386ec710b6aSStefano Zampini } 4387a02bda8eSBarry Smith /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 
43882c451681SBarry Smith if (!B->factortype) { 43892c451681SBarry Smith B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 43902c451681SBarry Smith B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 43912c451681SBarry Smith B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 43922c451681SBarry Smith B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 43932c451681SBarry Smith B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 43942c451681SBarry Smith B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4395150f0143SBarry Smith } else { 43962c451681SBarry Smith B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4397150f0143SBarry Smith } 4398150f0143SBarry Smith } 43993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4400150f0143SBarry Smith } 4401150f0143SBarry Smith 4402d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4403d71ae5a4SJacob Faibussowitsch { 44048758e1faSBarry Smith PetscInt k; 44058758e1faSBarry Smith const PetscInt *vi; 44066e111a19SKarl Rupp 440717454e89SShri Abhyankar PetscFunctionBegin; 440817454e89SShri Abhyankar vi = aj + ai[row]; 440917454e89SShri Abhyankar for (k = 0; k < nzl; k++) cols[k] = vi[k]; 441017454e89SShri Abhyankar vi = aj + adiag[row]; 441117454e89SShri Abhyankar cols[nzl] = vi[0]; 441217454e89SShri Abhyankar vi = aj + adiag[row + 1] + 1; 441317454e89SShri Abhyankar for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 44143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 441517454e89SShri Abhyankar } 44166936b636SHong Zhang /* 4417a02bda8eSBarry Smith MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4418a02bda8eSBarry Smith Modified from MatSeqAIJCheckInode(). 44196936b636SHong Zhang 44206936b636SHong Zhang Input Parameters: 4421abb87a52SBarry Smith . 
Mat A - ILU or LU matrix factor 4422abb87a52SBarry Smith 44236936b636SHong Zhang */ 4424d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4425d71ae5a4SJacob Faibussowitsch { 4426019b515eSShri Abhyankar Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4427019b515eSShri Abhyankar PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 44288758e1faSBarry Smith PetscInt *cols1, *cols2, *ns; 44298758e1faSBarry Smith const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4430ace3abfcSBarry Smith PetscBool flag; 4431019b515eSShri Abhyankar 4432019b515eSShri Abhyankar PetscFunctionBegin; 44333ba16761SJacob Faibussowitsch if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 44343ba16761SJacob Faibussowitsch if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4435019b515eSShri Abhyankar 4436019b515eSShri Abhyankar m = A->rmap->n; 44372205254eSKarl Rupp if (a->inode.size) ns = a->inode.size; 443848a46eb9SPierre Jolivet else PetscCall(PetscMalloc1(m + 1, &ns)); 4439019b515eSShri Abhyankar 4440019b515eSShri Abhyankar i = 0; 4441019b515eSShri Abhyankar node_count = 0; 44429566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4443019b515eSShri Abhyankar while (i < m) { /* For each row */ 4444019b515eSShri Abhyankar nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4445019b515eSShri Abhyankar nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4446019b515eSShri Abhyankar nzx = nzl1 + nzu1 + 1; 44473ba16761SJacob Faibussowitsch PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4448019b515eSShri Abhyankar 4449019b515eSShri Abhyankar /* Limits the number of elements in a node to 'a->inode.limit' */ 4450019b515eSShri Abhyankar for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4451019b515eSShri Abhyankar nzl2 = ai[j + 1] - ai[j]; 4452019b515eSShri Abhyankar nzu2 = adiag[j] - adiag[j + 1] - 1; 4453019b515eSShri 
Abhyankar nzy = nzl2 + nzu2 + 1; 4454019b515eSShri Abhyankar if (nzy != nzx) break; 44559566063dSJacob Faibussowitsch PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 44569566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 44578758e1faSBarry Smith if (!flag) break; 4458019b515eSShri Abhyankar } 4459019b515eSShri Abhyankar ns[node_count++] = blk_size; 4460019b515eSShri Abhyankar i = j; 4461019b515eSShri Abhyankar } 44629566063dSJacob Faibussowitsch PetscCall(PetscFree2(cols1, cols2)); 4463019b515eSShri Abhyankar /* If not enough inodes found,, do not use inode version of the routines */ 4464be6adb11SBarry Smith if (!m || node_count > .8 * m) { 44659566063dSJacob Faibussowitsch PetscCall(PetscFree(ns)); 44662205254eSKarl Rupp 4467019b515eSShri Abhyankar a->inode.node_count = 0; 44680298fd71SBarry Smith a->inode.size = NULL; 4469019b515eSShri Abhyankar a->inode.use = PETSC_FALSE; 44702205254eSKarl Rupp 44719566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4472019b515eSShri Abhyankar } else { 4473f4259b30SLisandro Dalcin A->ops->mult = NULL; 4474f4259b30SLisandro Dalcin A->ops->sor = NULL; 4475f4259b30SLisandro Dalcin A->ops->multadd = NULL; 4476f4259b30SLisandro Dalcin A->ops->getrowij = NULL; 4477f4259b30SLisandro Dalcin A->ops->restorerowij = NULL; 4478f4259b30SLisandro Dalcin A->ops->getcolumnij = NULL; 4479f4259b30SLisandro Dalcin A->ops->restorecolumnij = NULL; 4480f4259b30SLisandro Dalcin A->ops->coloringpatch = NULL; 4481f4259b30SLisandro Dalcin A->ops->multdiagonalblock = NULL; 4482019b515eSShri Abhyankar a->inode.node_count = node_count; 4483019b515eSShri Abhyankar a->inode.size = ns; 44842205254eSKarl Rupp 44859566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". 
Using Inode routines\n", node_count, m, a->inode.limit)); 4486019b515eSShri Abhyankar } 4487be6adb11SBarry Smith a->inode.checked = PETSC_TRUE; 44883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4489019b515eSShri Abhyankar } 4490019b515eSShri Abhyankar 4491d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4492d71ae5a4SJacob Faibussowitsch { 4493acf2f550SJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4494acf2f550SJed Brown 4495acf2f550SJed Brown PetscFunctionBegin; 4496acf2f550SJed Brown a->inode.ibdiagvalid = PETSC_FALSE; 44973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4498acf2f550SJed Brown } 4499acf2f550SJed Brown 45004c1414c8SBarry Smith /* 45014c1414c8SBarry Smith This is really ugly. if inodes are used this replaces the 45024c1414c8SBarry Smith permutations with ones that correspond to rows/cols of the matrix 45034c1414c8SBarry Smith rather then inode blocks 45044c1414c8SBarry Smith */ 4505d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4506d71ae5a4SJacob Faibussowitsch { 45074c1414c8SBarry Smith PetscFunctionBegin; 4508cac4c232SBarry Smith PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 45093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 45104c1414c8SBarry Smith } 45114c1414c8SBarry Smith 4512d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4513d71ae5a4SJacob Faibussowitsch { 45144c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 45155d0c19d7SBarry Smith PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 45165d0c19d7SBarry Smith const PetscInt *ridx, *cidx; 45174c1414c8SBarry Smith PetscInt row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx; 45184c1414c8SBarry Smith PetscInt nslim_col, *ns_col; 45194c1414c8SBarry Smith IS ris = *rperm, cis = *cperm; 
45204c1414c8SBarry Smith 45214c1414c8SBarry Smith PetscFunctionBegin; 45223ba16761SJacob Faibussowitsch if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 45233ba16761SJacob Faibussowitsch if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 45244c1414c8SBarry Smith 45259566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 45269566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns)); 45279566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(m, &permr, n, &permc)); 45284c1414c8SBarry Smith 45299566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ris, &ridx)); 45309566063dSJacob Faibussowitsch PetscCall(ISGetIndices(cis, &cidx)); 45314c1414c8SBarry Smith 45324c1414c8SBarry Smith /* Form the inode structure for the rows of permuted matric using inv perm*/ 45334c1414c8SBarry Smith for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i]; 45344c1414c8SBarry Smith 45354c1414c8SBarry Smith /* Construct the permutations for rows*/ 45364c1414c8SBarry Smith for (i = 0, row = 0; i < nslim_row; ++i) { 45374c1414c8SBarry Smith indx = ridx[i]; 45384c1414c8SBarry Smith start_val = tns[indx]; 45394c1414c8SBarry Smith end_val = tns[indx + 1]; 45404c1414c8SBarry Smith for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 45414c1414c8SBarry Smith } 45424c1414c8SBarry Smith 45434c1414c8SBarry Smith /* Form the inode structure for the columns of permuted matrix using inv perm*/ 45444c1414c8SBarry Smith for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i]; 45454c1414c8SBarry Smith 45464c1414c8SBarry Smith /* Construct permutations for columns */ 45474c1414c8SBarry Smith for (i = 0, col = 0; i < nslim_col; ++i) { 45484c1414c8SBarry Smith indx = cidx[i]; 45494c1414c8SBarry Smith start_val = tns[indx]; 45504c1414c8SBarry Smith end_val = tns[indx + 1]; 45514c1414c8SBarry 
Smith for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 45524c1414c8SBarry Smith } 45534c1414c8SBarry Smith 45549566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 45559566063dSJacob Faibussowitsch PetscCall(ISSetPermutation(*rperm)); 45569566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 45579566063dSJacob Faibussowitsch PetscCall(ISSetPermutation(*cperm)); 45584c1414c8SBarry Smith 45599566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ris, &ridx)); 45609566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(cis, &cidx)); 45614c1414c8SBarry Smith 45629566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col)); 45639566063dSJacob Faibussowitsch PetscCall(PetscFree2(permr, permc)); 45649566063dSJacob Faibussowitsch PetscCall(ISDestroy(&cis)); 45659566063dSJacob Faibussowitsch PetscCall(ISDestroy(&ris)); 45669566063dSJacob Faibussowitsch PetscCall(PetscFree(tns)); 45673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 45684c1414c8SBarry Smith } 45694c1414c8SBarry Smith 45704c1414c8SBarry Smith /*@C 457111a5261eSBarry Smith MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 45724c1414c8SBarry Smith 45733f9fe445SBarry Smith Not Collective 45744c1414c8SBarry Smith 45754c1414c8SBarry Smith Input Parameter: 457611a5261eSBarry Smith . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 45774c1414c8SBarry Smith 4578d8d19677SJose E. Roman Output Parameters: 45794c1414c8SBarry Smith + node_count - no of inodes present in the matrix. 45802ef1f0ffSBarry Smith . sizes - an array of size `node_count`, with the sizes of each inode. 45814c1414c8SBarry Smith - limit - the max size used to generate the inodes. 
45824c1414c8SBarry Smith 45834c1414c8SBarry Smith Level: advanced 45844c1414c8SBarry Smith 458511a5261eSBarry Smith Note: 45864c1414c8SBarry Smith It should be called after the matrix is assembled. 45874c1414c8SBarry Smith The contents of the sizes[] array should not be changed. 45882ef1f0ffSBarry Smith `NULL` may be passed for information not needed 45894c1414c8SBarry Smith 45902ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatGetInfo()` 45914c1414c8SBarry Smith @*/ 4592d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4593d71ae5a4SJacob Faibussowitsch { 45945f80ce2aSJacob Faibussowitsch PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 45954c1414c8SBarry Smith 45964c1414c8SBarry Smith PetscFunctionBegin; 45975f80ce2aSJacob Faibussowitsch PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 45989566063dSJacob Faibussowitsch PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 45999566063dSJacob Faibussowitsch if (f) PetscCall((*f)(A, node_count, sizes, limit)); 46003ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 46014c1414c8SBarry Smith } 46024c1414c8SBarry Smith 4603d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4604d71ae5a4SJacob Faibussowitsch { 46054c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 46064c1414c8SBarry Smith 46074c1414c8SBarry Smith PetscFunctionBegin; 46084c1414c8SBarry Smith if (node_count) *node_count = a->inode.node_count; 46094c1414c8SBarry Smith if (sizes) *sizes = a->inode.size; 46104c1414c8SBarry Smith if (limit) *limit = a->inode.limit; 46113ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 46124c1414c8SBarry Smith } 4613