14c1414c8SBarry Smith /* 24c1414c8SBarry Smith This file provides high performance routines for the Inode format (compressed sparse row) 34c1414c8SBarry Smith by taking advantage of rows with identical nonzero structure (I-nodes). 44c1414c8SBarry Smith */ 5c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> 6fb56d528SJed Brown #if defined(PETSC_HAVE_XMMINTRIN_H) 7fb56d528SJed Brown #include <xmmintrin.h> 8fb56d528SJed Brown #endif 94c1414c8SBarry Smith 10d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 11d71ae5a4SJacob Faibussowitsch { 124c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 134c1414c8SBarry Smith PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 144c1414c8SBarry Smith 154c1414c8SBarry Smith PetscFunctionBegin; 16d0f46423SBarry Smith n = A->cmap->n; 17d0f46423SBarry Smith m = A->rmap->n; 184d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 194d12350bSJunchao Zhang ns_row = a->inode.size_csr; 204c1414c8SBarry Smith 214c1414c8SBarry Smith min_mn = (m < n) ? m : n; 224c1414c8SBarry Smith if (!ns) { 234d12350bSJunchao Zhang for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++); 24fbccb6d4SPierre Jolivet for (; count + 1 < n; count++, i++); 25ad540459SPierre Jolivet if (count < n) i++; 264c1414c8SBarry Smith *size = i; 273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 284c1414c8SBarry Smith } 299566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &ns_col)); 304d12350bSJunchao Zhang ns_col[0] = 0; 314c1414c8SBarry Smith 324c1414c8SBarry Smith /* Use the same row structure wherever feasible. 
*/ 334d12350bSJunchao Zhang for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1]; 344c1414c8SBarry Smith 354c1414c8SBarry Smith /* if m < n; pad up the remainder with inode_limit */ 364d12350bSJunchao Zhang for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1; 37aaa8cc7dSPierre Jolivet /* The last node is the odd ball. pad it up with the remaining rows; */ 384c1414c8SBarry Smith if (count < n) { 394d12350bSJunchao Zhang ns_col[i + 1] = ns_col[i] + (n - count); 404c1414c8SBarry Smith i++; 414c1414c8SBarry Smith } else if (count > n) { 424c1414c8SBarry Smith /* Adjust for the over estimation */ 434d12350bSJunchao Zhang ns_col[i] += n - count; 444c1414c8SBarry Smith } 454c1414c8SBarry Smith *size = i; 464c1414c8SBarry Smith *ns = ns_col; 473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 484c1414c8SBarry Smith } 494c1414c8SBarry Smith 504c1414c8SBarry Smith /* 514c1414c8SBarry Smith This builds symmetric version of nonzero structure, 524c1414c8SBarry Smith */ 53d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 54d71ae5a4SJacob Faibussowitsch { 554c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 568758e1faSBarry Smith PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 574d12350bSJunchao Zhang PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2; 588758e1faSBarry Smith const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 594c1414c8SBarry Smith 604c1414c8SBarry Smith PetscFunctionBegin; 614c1414c8SBarry Smith nslim_row = a->inode.node_count; 62d0f46423SBarry Smith m = A->rmap->n; 63d0f46423SBarry Smith n = A->cmap->n; 6408401ef6SPierre Jolivet PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 654d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, 
PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 664c1414c8SBarry Smith 674c1414c8SBarry Smith /* Use the row_inode as column_inode */ 684c1414c8SBarry Smith nslim_col = nslim_row; 694c1414c8SBarry Smith ns_col = ns_row; 704c1414c8SBarry Smith 7135cb6cd3SPierre Jolivet /* allocate space for reformatted inode structure */ 729566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 734d12350bSJunchao Zhang for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]); 744c1414c8SBarry Smith 754c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 764d12350bSJunchao Zhang nsz = ns_col[i1 + 1] - ns_col[i1]; 772205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 784c1414c8SBarry Smith } 794c1414c8SBarry Smith /* allocate space for row pointers */ 809566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 814c1414c8SBarry Smith *iia = ia; 829566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_row + 1, &work)); 834c1414c8SBarry Smith 844c1414c8SBarry Smith /* determine the number of columns in each row */ 854c1414c8SBarry Smith ia[0] = oshift; 864d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) { 874d12350bSJunchao Zhang row = ns_row[i1]; 884c1414c8SBarry Smith j = aj + ai[row] + ishift; 894c1414c8SBarry Smith jmax = aj + ai[row + 1] + ishift; 9083fed2edSSatish Balay if (j == jmax) continue; /* empty row */ 914c1414c8SBarry Smith col = *j++ + ishift; 924c1414c8SBarry Smith i2 = tvc[col]; 936aad120cSJose E. 
Roman while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 944c1414c8SBarry Smith ia[i1 + 1]++; 954c1414c8SBarry Smith ia[i2 + 1]++; 964c1414c8SBarry Smith i2++; /* Start col of next node */ 9790d2dec7SBarry Smith while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 984c1414c8SBarry Smith i2 = tvc[col]; 994c1414c8SBarry Smith } 1004c1414c8SBarry Smith if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 1014c1414c8SBarry Smith } 1024c1414c8SBarry Smith 1034c1414c8SBarry Smith /* shift ia[i] to point to next row */ 1044c1414c8SBarry Smith for (i1 = 1; i1 < nslim_row + 1; i1++) { 1054c1414c8SBarry Smith row = ia[i1 - 1]; 1064c1414c8SBarry Smith ia[i1] += row; 1074c1414c8SBarry Smith work[i1 - 1] = row - oshift; 1084c1414c8SBarry Smith } 1094c1414c8SBarry Smith 1104c1414c8SBarry Smith /* allocate space for column pointers */ 1114c1414c8SBarry Smith nz = ia[nslim_row] + (!ishift); 1129566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja)); 1134c1414c8SBarry Smith *jja = ja; 1144c1414c8SBarry Smith 1154c1414c8SBarry Smith /* loop over lower triangular part putting into ja */ 1164d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) { 1174d12350bSJunchao Zhang row = ns_row[i1]; 1184c1414c8SBarry Smith j = aj + ai[row] + ishift; 1194c1414c8SBarry Smith jmax = aj + ai[row + 1] + ishift; 12083fed2edSSatish Balay if (j == jmax) continue; /* empty row */ 1214c1414c8SBarry Smith col = *j++ + ishift; 1224c1414c8SBarry Smith i2 = tvc[col]; 1234c1414c8SBarry Smith while (i2 < i1 && j < jmax) { 1244c1414c8SBarry Smith ja[work[i2]++] = i1 + oshift; 1254c1414c8SBarry Smith ja[work[i1]++] = i2 + oshift; 1264c1414c8SBarry Smith ++i2; 12790d2dec7SBarry Smith while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 1284c1414c8SBarry Smith i2 = tvc[col]; 1294c1414c8SBarry Smith } 1304c1414c8SBarry Smith if (i2 == i1) ja[work[i1]++] = i2 + oshift; 1314c1414c8SBarry Smith } 1329566063dSJacob 
Faibussowitsch PetscCall(PetscFree(work)); 1339566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc)); 1343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1354c1414c8SBarry Smith } 1364c1414c8SBarry Smith 1374c1414c8SBarry Smith /* 1384c1414c8SBarry Smith This builds nonsymmetric version of nonzero structure, 1394c1414c8SBarry Smith */ 140d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 141d71ae5a4SJacob Faibussowitsch { 1424c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1438758e1faSBarry Smith PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 1448758e1faSBarry Smith PetscInt *tns, *tvc, nsz, i1, i2; 1454d12350bSJunchao Zhang const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr; 1464c1414c8SBarry Smith 1474c1414c8SBarry Smith PetscFunctionBegin; 1484d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 1494c1414c8SBarry Smith nslim_row = a->inode.node_count; 150d0f46423SBarry Smith n = A->cmap->n; 1514c1414c8SBarry Smith 1524c1414c8SBarry Smith /* Create The column_inode for this matrix */ 1539566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 1544c1414c8SBarry Smith 15535cb6cd3SPierre Jolivet /* allocate space for reformatted column_inode structure */ 1569566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 1574d12350bSJunchao Zhang for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]); 1584c1414c8SBarry Smith 1594c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 1604d12350bSJunchao Zhang nsz = ns_col[i1 + 1] - ns_col[i1]; 1612205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 1624c1414c8SBarry Smith } 1634c1414c8SBarry Smith /* allocate space for row 
pointers */ 1649566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 1654c1414c8SBarry Smith *iia = ia; 1669566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_row + 1, &work)); 1674c1414c8SBarry Smith 1684c1414c8SBarry Smith /* determine the number of columns in each row */ 1694c1414c8SBarry Smith ia[0] = oshift; 1704d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) { 1714d12350bSJunchao Zhang row = ns_row[i1]; 1724c1414c8SBarry Smith j = aj + ai[row] + ishift; 17383fed2edSSatish Balay nz = ai[row + 1] - ai[row]; 17483fed2edSSatish Balay if (!nz) continue; /* empty row */ 1754c1414c8SBarry Smith col = *j++ + ishift; 1764c1414c8SBarry Smith i2 = tvc[col]; 1776aad120cSJose E. Roman while (nz-- > 0) { /* off-diagonal elements */ 1784c1414c8SBarry Smith ia[i1 + 1]++; 1794c1414c8SBarry Smith i2++; /* Start col of next node */ 180a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 1814c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 1824c1414c8SBarry Smith } 1834c1414c8SBarry Smith } 1844c1414c8SBarry Smith 1854c1414c8SBarry Smith /* shift ia[i] to point to next row */ 1864c1414c8SBarry Smith for (i1 = 1; i1 < nslim_row + 1; i1++) { 1874c1414c8SBarry Smith row = ia[i1 - 1]; 1884c1414c8SBarry Smith ia[i1] += row; 1894c1414c8SBarry Smith work[i1 - 1] = row - oshift; 1904c1414c8SBarry Smith } 1914c1414c8SBarry Smith 1924c1414c8SBarry Smith /* allocate space for column pointers */ 1934c1414c8SBarry Smith nz = ia[nslim_row] + (!ishift); 1949566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja)); 1954c1414c8SBarry Smith *jja = ja; 1964c1414c8SBarry Smith 1974c1414c8SBarry Smith /* loop over matrix putting into ja */ 1984d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) { 1994d12350bSJunchao Zhang row = ns_row[i1]; 2004c1414c8SBarry Smith j = aj + ai[row] + ishift; 20183fed2edSSatish Balay nz = ai[row + 1] - ai[row]; 20283fed2edSSatish Balay if (!nz) continue; /* empty row */ 2034c1414c8SBarry Smith col = 
*j++ + ishift; 2044c1414c8SBarry Smith i2 = tvc[col]; 2054c1414c8SBarry Smith while (nz-- > 0) { 2064c1414c8SBarry Smith ja[work[i1]++] = i2 + oshift; 2074c1414c8SBarry Smith ++i2; 208a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 2094c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 2104c1414c8SBarry Smith } 2114c1414c8SBarry Smith } 2129566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col)); 2139566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 2149566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc)); 2153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2164c1414c8SBarry Smith } 2174c1414c8SBarry Smith 218d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 219d71ae5a4SJacob Faibussowitsch { 2204c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2214c1414c8SBarry Smith 2224c1414c8SBarry Smith PetscFunctionBegin; 22350ba90b4SBarry Smith if (n) *n = a->inode.node_count; 2243ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 2258f7157efSSatish Balay if (!blockcompressed) { 2269566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 2278f7157efSSatish Balay } else if (symmetric) { 2289566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 2294c1414c8SBarry Smith } else { 2309566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 2314c1414c8SBarry Smith } 2323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2334c1414c8SBarry Smith } 2344c1414c8SBarry Smith 235d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const 
PetscInt *ja[], PetscBool *done) 236d71ae5a4SJacob Faibussowitsch { 2374c1414c8SBarry Smith PetscFunctionBegin; 2383ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 2398f7157efSSatish Balay 2408f7157efSSatish Balay if (!blockcompressed) { 2419566063dSJacob Faibussowitsch PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 2428f7157efSSatish Balay } else { 2439566063dSJacob Faibussowitsch PetscCall(PetscFree(*ia)); 2449566063dSJacob Faibussowitsch PetscCall(PetscFree(*ja)); 2458f7157efSSatish Balay } 2463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2474c1414c8SBarry Smith } 2484c1414c8SBarry Smith 249d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 250d71ae5a4SJacob Faibussowitsch { 2514c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2524c1414c8SBarry Smith PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 2534d12350bSJunchao Zhang PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j; 2544c1414c8SBarry Smith 2554c1414c8SBarry Smith PetscFunctionBegin; 2564d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2574c1414c8SBarry Smith nslim_row = a->inode.node_count; 258d0f46423SBarry Smith n = A->cmap->n; 2594c1414c8SBarry Smith 2604c1414c8SBarry Smith /* Create The column_inode for this matrix */ 2619566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 2624c1414c8SBarry Smith 26335cb6cd3SPierre Jolivet /* allocate space for reformatted column_inode structure */ 2649566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 2654d12350bSJunchao Zhang for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]); 
2664c1414c8SBarry Smith 2674c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 2684d12350bSJunchao Zhang nsz = ns_col[i1 + 1] - ns_col[i1]; 2692205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 2704c1414c8SBarry Smith } 2714c1414c8SBarry Smith /* allocate space for column pointers */ 2729566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 2734c1414c8SBarry Smith *iia = ia; 2749566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_col + 1, &work)); 2754c1414c8SBarry Smith 2764c1414c8SBarry Smith /* determine the number of columns in each row */ 2774c1414c8SBarry Smith ia[0] = oshift; 2784d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) { 2794d12350bSJunchao Zhang row = ns_row[i1]; 2804c1414c8SBarry Smith j = aj + ai[row] + ishift; 2814c1414c8SBarry Smith col = *j++ + ishift; 2824c1414c8SBarry Smith i2 = tvc[col]; 2834c1414c8SBarry Smith nz = ai[row + 1] - ai[row]; 2846aad120cSJose E. Roman while (nz-- > 0) { /* off-diagonal elements */ 2854c1414c8SBarry Smith /* ia[i1+1]++; */ 2864c1414c8SBarry Smith ia[i2 + 1]++; 2874c1414c8SBarry Smith i2++; 288a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 2894c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 2904c1414c8SBarry Smith } 2914c1414c8SBarry Smith } 2924c1414c8SBarry Smith 2934c1414c8SBarry Smith /* shift ia[i] to point to next col */ 2944c1414c8SBarry Smith for (i1 = 1; i1 < nslim_col + 1; i1++) { 2954c1414c8SBarry Smith col = ia[i1 - 1]; 2964c1414c8SBarry Smith ia[i1] += col; 2974c1414c8SBarry Smith work[i1 - 1] = col - oshift; 2984c1414c8SBarry Smith } 2994c1414c8SBarry Smith 3004c1414c8SBarry Smith /* allocate space for column pointers */ 3014c1414c8SBarry Smith nz = ia[nslim_col] + (!ishift); 3029566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja)); 3034c1414c8SBarry Smith *jja = ja; 3044c1414c8SBarry Smith 3054c1414c8SBarry Smith /* loop over matrix putting into ja */ 3064d12350bSJunchao Zhang for (i1 = 0; i1 < 
nslim_row; i1++) { 3074d12350bSJunchao Zhang row = ns_row[i1]; 3084c1414c8SBarry Smith j = aj + ai[row] + ishift; 3094c1414c8SBarry Smith col = *j++ + ishift; 3104c1414c8SBarry Smith i2 = tvc[col]; 3114c1414c8SBarry Smith nz = ai[row + 1] - ai[row]; 3124c1414c8SBarry Smith while (nz-- > 0) { 3134c1414c8SBarry Smith /* ja[work[i1]++] = i2 + oshift; */ 3144c1414c8SBarry Smith ja[work[i2]++] = i1 + oshift; 3154c1414c8SBarry Smith i2++; 316a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 3174c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 3184c1414c8SBarry Smith } 3194c1414c8SBarry Smith } 3209566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col)); 3219566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 3229566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc)); 3233ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3244c1414c8SBarry Smith } 3254c1414c8SBarry Smith 326d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 327d71ae5a4SJacob Faibussowitsch { 3284c1414c8SBarry Smith PetscFunctionBegin; 3299566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, n, NULL)); 3303ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 3314c1414c8SBarry Smith 3328f7157efSSatish Balay if (!blockcompressed) { 3339566063dSJacob Faibussowitsch PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 3348f7157efSSatish Balay } else if (symmetric) { 335a5b23f4aSJose E. 
Roman /* Since the indices are symmetric it doesn't matter */ 3369566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 3374c1414c8SBarry Smith } else { 3389566063dSJacob Faibussowitsch PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 3394c1414c8SBarry Smith } 3403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3414c1414c8SBarry Smith } 3424c1414c8SBarry Smith 343d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 344d71ae5a4SJacob Faibussowitsch { 3454c1414c8SBarry Smith PetscFunctionBegin; 3463ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 3478f7157efSSatish Balay if (!blockcompressed) { 3489566063dSJacob Faibussowitsch PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 3498f7157efSSatish Balay } else { 3509566063dSJacob Faibussowitsch PetscCall(PetscFree(*ia)); 3519566063dSJacob Faibussowitsch PetscCall(PetscFree(*ja)); 3528f7157efSSatish Balay } 3533ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3544c1414c8SBarry Smith } 3554c1414c8SBarry Smith 356d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 357d71ae5a4SJacob Faibussowitsch { 3584c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 359d9fead3dSBarry Smith PetscScalar *y; 360dd6ea824SBarry Smith const PetscScalar *x; 361708a0e70SJunchao Zhang PetscInt row, node_max, nonzerorow = 0; 362708a0e70SJunchao Zhang PetscInt *ns; 3634c1414c8SBarry Smith 3644c1414c8SBarry Smith PetscFunctionBegin; 3654d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 3664c1414c8SBarry Smith node_max = a->inode.node_count; 3674d12350bSJunchao Zhang ns = a->inode.size_csr; /* Node 
Size array */ 3689566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3699566063dSJacob Faibussowitsch PetscCall(VecGetArray(yy, &y)); 3704c1414c8SBarry Smith 371708a0e70SJunchao Zhang PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow)) 372708a0e70SJunchao Zhang for (PetscInt i = 0; i < node_max; ++i) { 373708a0e70SJunchao Zhang PetscInt i1, i2, nsz, n, sz; 374708a0e70SJunchao Zhang const MatScalar *v1, *v2, *v3, *v4, *v5; 375708a0e70SJunchao Zhang PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 376708a0e70SJunchao Zhang const PetscInt *idx; 377708a0e70SJunchao Zhang 378708a0e70SJunchao Zhang #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 379708a0e70SJunchao Zhang #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 380708a0e70SJunchao Zhang #endif 381708a0e70SJunchao Zhang row = ns[i]; 3824d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i]; 383708a0e70SJunchao Zhang n = a->i[row + 1] - a->i[row]; 38498c9bda7SSatish Balay nonzerorow += (n > 0) * nsz; 385708a0e70SJunchao Zhang 386708a0e70SJunchao Zhang idx = &a->j[a->i[row]]; 387708a0e70SJunchao Zhang v1 = &a->a[a->i[row]]; 38850d8bf02SJed Brown PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 38950d8bf02SJed Brown PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 3904c1414c8SBarry Smith sz = n; /* No of non zeros in this row */ 3914c1414c8SBarry Smith /* Switch on the size of Node */ 3924c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */ 3934c1414c8SBarry Smith case 1: 39475567043SBarry Smith sum1 = 0.; 3954c1414c8SBarry Smith 3964c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 3974c1414c8SBarry Smith i1 = idx[0]; /* The instructions are ordered to */ 3984c1414c8SBarry Smith i2 = idx[1]; /* make the compiler's job easy */ 3994c1414c8SBarry Smith idx += 2; 4004c1414c8SBarry Smith tmp0 = x[i1]; 
4014c1414c8SBarry Smith tmp1 = x[i2]; 4029371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4039371c9d4SSatish Balay v1 += 2; 4044c1414c8SBarry Smith } 4054c1414c8SBarry Smith 4064c1414c8SBarry Smith if (n == sz - 1) { /* Take care of the last nonzero */ 4074c1414c8SBarry Smith tmp0 = x[*idx++]; 4084c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4094c1414c8SBarry Smith } 4104c1414c8SBarry Smith y[row++] = sum1; 4114c1414c8SBarry Smith break; 4124c1414c8SBarry Smith case 2: 41375567043SBarry Smith sum1 = 0.; 41475567043SBarry Smith sum2 = 0.; 4154c1414c8SBarry Smith v2 = v1 + n; 4164c1414c8SBarry Smith 4174c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 4184c1414c8SBarry Smith i1 = idx[0]; 4194c1414c8SBarry Smith i2 = idx[1]; 4204c1414c8SBarry Smith idx += 2; 4214c1414c8SBarry Smith tmp0 = x[i1]; 4224c1414c8SBarry Smith tmp1 = x[i2]; 4239371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4249371c9d4SSatish Balay v1 += 2; 4259371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 4269371c9d4SSatish Balay v2 += 2; 4274c1414c8SBarry Smith } 4284c1414c8SBarry Smith if (n == sz - 1) { 4294c1414c8SBarry Smith tmp0 = x[*idx++]; 4304c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4314c1414c8SBarry Smith sum2 += *v2++ * tmp0; 4324c1414c8SBarry Smith } 4334c1414c8SBarry Smith y[row++] = sum1; 4344c1414c8SBarry Smith y[row++] = sum2; 4354c1414c8SBarry Smith v1 = v2; /* Since the next block to be processed starts there*/ 4364c1414c8SBarry Smith idx += sz; 4374c1414c8SBarry Smith break; 4384c1414c8SBarry Smith case 3: 43975567043SBarry Smith sum1 = 0.; 44075567043SBarry Smith sum2 = 0.; 44175567043SBarry Smith sum3 = 0.; 4424c1414c8SBarry Smith v2 = v1 + n; 4434c1414c8SBarry Smith v3 = v2 + n; 4444c1414c8SBarry Smith 4454c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 4464c1414c8SBarry Smith i1 = idx[0]; 4474c1414c8SBarry Smith i2 = idx[1]; 4484c1414c8SBarry Smith idx += 2; 4494c1414c8SBarry Smith tmp0 = x[i1]; 4504c1414c8SBarry Smith tmp1 = x[i2]; 
4519371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4529371c9d4SSatish Balay v1 += 2; 4539371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 4549371c9d4SSatish Balay v2 += 2; 4559371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 4569371c9d4SSatish Balay v3 += 2; 4574c1414c8SBarry Smith } 4584c1414c8SBarry Smith if (n == sz - 1) { 4594c1414c8SBarry Smith tmp0 = x[*idx++]; 4604c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4614c1414c8SBarry Smith sum2 += *v2++ * tmp0; 4624c1414c8SBarry Smith sum3 += *v3++ * tmp0; 4634c1414c8SBarry Smith } 4644c1414c8SBarry Smith y[row++] = sum1; 4654c1414c8SBarry Smith y[row++] = sum2; 4664c1414c8SBarry Smith y[row++] = sum3; 4674c1414c8SBarry Smith v1 = v3; /* Since the next block to be processed starts there*/ 4684c1414c8SBarry Smith idx += 2 * sz; 4694c1414c8SBarry Smith break; 4704c1414c8SBarry Smith case 4: 47175567043SBarry Smith sum1 = 0.; 47275567043SBarry Smith sum2 = 0.; 47375567043SBarry Smith sum3 = 0.; 47475567043SBarry Smith sum4 = 0.; 4754c1414c8SBarry Smith v2 = v1 + n; 4764c1414c8SBarry Smith v3 = v2 + n; 4774c1414c8SBarry Smith v4 = v3 + n; 4784c1414c8SBarry Smith 4794c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 4804c1414c8SBarry Smith i1 = idx[0]; 4814c1414c8SBarry Smith i2 = idx[1]; 4824c1414c8SBarry Smith idx += 2; 4834c1414c8SBarry Smith tmp0 = x[i1]; 4844c1414c8SBarry Smith tmp1 = x[i2]; 4859371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4869371c9d4SSatish Balay v1 += 2; 4879371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 4889371c9d4SSatish Balay v2 += 2; 4899371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 4909371c9d4SSatish Balay v3 += 2; 4919371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1; 4929371c9d4SSatish Balay v4 += 2; 4934c1414c8SBarry Smith } 4944c1414c8SBarry Smith if (n == sz - 1) { 4954c1414c8SBarry Smith tmp0 = x[*idx++]; 4964c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4974c1414c8SBarry Smith sum2 += *v2++ * tmp0; 4984c1414c8SBarry 
Smith sum3 += *v3++ * tmp0; 4994c1414c8SBarry Smith sum4 += *v4++ * tmp0; 5004c1414c8SBarry Smith } 5014c1414c8SBarry Smith y[row++] = sum1; 5024c1414c8SBarry Smith y[row++] = sum2; 5034c1414c8SBarry Smith y[row++] = sum3; 5044c1414c8SBarry Smith y[row++] = sum4; 5054c1414c8SBarry Smith v1 = v4; /* Since the next block to be processed starts there*/ 5064c1414c8SBarry Smith idx += 3 * sz; 5074c1414c8SBarry Smith break; 5084c1414c8SBarry Smith case 5: 50975567043SBarry Smith sum1 = 0.; 51075567043SBarry Smith sum2 = 0.; 51175567043SBarry Smith sum3 = 0.; 51275567043SBarry Smith sum4 = 0.; 51375567043SBarry Smith sum5 = 0.; 5144c1414c8SBarry Smith v2 = v1 + n; 5154c1414c8SBarry Smith v3 = v2 + n; 5164c1414c8SBarry Smith v4 = v3 + n; 5174c1414c8SBarry Smith v5 = v4 + n; 5184c1414c8SBarry Smith 5194c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 5204c1414c8SBarry Smith i1 = idx[0]; 5214c1414c8SBarry Smith i2 = idx[1]; 5224c1414c8SBarry Smith idx += 2; 5234c1414c8SBarry Smith tmp0 = x[i1]; 5244c1414c8SBarry Smith tmp1 = x[i2]; 5259371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 5269371c9d4SSatish Balay v1 += 2; 5279371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 5289371c9d4SSatish Balay v2 += 2; 5299371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 5309371c9d4SSatish Balay v3 += 2; 5319371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1; 5329371c9d4SSatish Balay v4 += 2; 5339371c9d4SSatish Balay sum5 += v5[0] * tmp0 + v5[1] * tmp1; 5349371c9d4SSatish Balay v5 += 2; 5354c1414c8SBarry Smith } 5364c1414c8SBarry Smith if (n == sz - 1) { 5374c1414c8SBarry Smith tmp0 = x[*idx++]; 5384c1414c8SBarry Smith sum1 += *v1++ * tmp0; 5394c1414c8SBarry Smith sum2 += *v2++ * tmp0; 5404c1414c8SBarry Smith sum3 += *v3++ * tmp0; 5414c1414c8SBarry Smith sum4 += *v4++ * tmp0; 5424c1414c8SBarry Smith sum5 += *v5++ * tmp0; 5434c1414c8SBarry Smith } 5444c1414c8SBarry Smith y[row++] = sum1; 5454c1414c8SBarry Smith y[row++] = sum2; 5464c1414c8SBarry Smith 
y[row++] = sum3; 5474c1414c8SBarry Smith y[row++] = sum4; 5484c1414c8SBarry Smith y[row++] = sum5; 5494c1414c8SBarry Smith v1 = v5; /* Since the next block to be processed starts there */ 5504c1414c8SBarry Smith idx += 4 * sz; 5514c1414c8SBarry Smith break; 552d71ae5a4SJacob Faibussowitsch default: 553708a0e70SJunchao Zhang SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz); 5544c1414c8SBarry Smith } 5554c1414c8SBarry Smith } 5569566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5579566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(yy, &y)); 5589566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 5593ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5604c1414c8SBarry Smith } 5612ef1f0ffSBarry Smith 5624108e4d5SBarry Smith /* Almost same code as the MatMult_SeqAIJ_Inode() */ 563d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 564d71ae5a4SJacob Faibussowitsch { 5654c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 5664c1414c8SBarry Smith PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 5678758e1faSBarry Smith const MatScalar *v1, *v2, *v3, *v4, *v5; 5688758e1faSBarry Smith const PetscScalar *x; 5698758e1faSBarry Smith PetscScalar *y, *z, *zt; 5708758e1faSBarry Smith PetscInt i1, i2, n, i, row, node_max, nsz, sz; 5718758e1faSBarry Smith const PetscInt *idx, *ns, *ii; 5724c1414c8SBarry Smith 5734c1414c8SBarry Smith PetscFunctionBegin; 5744d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 5754c1414c8SBarry Smith node_max = a->inode.node_count; 5764d12350bSJunchao Zhang ns = a->inode.size_csr; /* Node Size array */ 5772205254eSKarl Rupp 5789566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5799566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 5804c1414c8SBarry Smith 
zt = z; 5814c1414c8SBarry Smith 5824c1414c8SBarry Smith idx = a->j; 5834c1414c8SBarry Smith v1 = a->a; 5844c1414c8SBarry Smith ii = a->i; 5854c1414c8SBarry Smith 5864d12350bSJunchao Zhang for (i = 0; i < node_max; ++i) { 5874d12350bSJunchao Zhang row = ns[i]; 5884d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i]; 5894c1414c8SBarry Smith n = ii[1] - ii[0]; 5904c1414c8SBarry Smith ii += nsz; 5914c1414c8SBarry Smith sz = n; /* No of non zeros in this row */ 5924c1414c8SBarry Smith /* Switch on the size of Node */ 5934c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */ 5944c1414c8SBarry Smith case 1: 5954c1414c8SBarry Smith sum1 = *zt++; 5964c1414c8SBarry Smith 5974c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 5984c1414c8SBarry Smith i1 = idx[0]; /* The instructions are ordered to */ 5994c1414c8SBarry Smith i2 = idx[1]; /* make the compiler's job easy */ 6004c1414c8SBarry Smith idx += 2; 6014c1414c8SBarry Smith tmp0 = x[i1]; 6024c1414c8SBarry Smith tmp1 = x[i2]; 6039371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 6049371c9d4SSatish Balay v1 += 2; 6054c1414c8SBarry Smith } 6064c1414c8SBarry Smith 6074c1414c8SBarry Smith if (n == sz - 1) { /* Take care of the last nonzero */ 6084c1414c8SBarry Smith tmp0 = x[*idx++]; 6094c1414c8SBarry Smith sum1 += *v1++ * tmp0; 6104c1414c8SBarry Smith } 6114c1414c8SBarry Smith y[row++] = sum1; 6124c1414c8SBarry Smith break; 6134c1414c8SBarry Smith case 2: 6144c1414c8SBarry Smith sum1 = *zt++; 6154c1414c8SBarry Smith sum2 = *zt++; 6164c1414c8SBarry Smith v2 = v1 + n; 6174c1414c8SBarry Smith 6184c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 6194c1414c8SBarry Smith i1 = idx[0]; 6204c1414c8SBarry Smith i2 = idx[1]; 6214c1414c8SBarry Smith idx += 2; 6224c1414c8SBarry Smith tmp0 = x[i1]; 6234c1414c8SBarry Smith tmp1 = x[i2]; 6249371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 6259371c9d4SSatish Balay v1 += 2; 6269371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 6279371c9d4SSatish 
Balay v2 += 2; 6284c1414c8SBarry Smith } 6294c1414c8SBarry Smith if (n == sz - 1) { 6304c1414c8SBarry Smith tmp0 = x[*idx++]; 6314c1414c8SBarry Smith sum1 += *v1++ * tmp0; 6324c1414c8SBarry Smith sum2 += *v2++ * tmp0; 6334c1414c8SBarry Smith } 6344c1414c8SBarry Smith y[row++] = sum1; 6354c1414c8SBarry Smith y[row++] = sum2; 6364c1414c8SBarry Smith v1 = v2; /* Since the next block to be processed starts there*/ 6374c1414c8SBarry Smith idx += sz; 6384c1414c8SBarry Smith break; 6394c1414c8SBarry Smith case 3: 6404c1414c8SBarry Smith sum1 = *zt++; 6414c1414c8SBarry Smith sum2 = *zt++; 6424c1414c8SBarry Smith sum3 = *zt++; 6434c1414c8SBarry Smith v2 = v1 + n; 6444c1414c8SBarry Smith v3 = v2 + n; 6454c1414c8SBarry Smith 6464c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 6474c1414c8SBarry Smith i1 = idx[0]; 6484c1414c8SBarry Smith i2 = idx[1]; 6494c1414c8SBarry Smith idx += 2; 6504c1414c8SBarry Smith tmp0 = x[i1]; 6514c1414c8SBarry Smith tmp1 = x[i2]; 6529371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 6539371c9d4SSatish Balay v1 += 2; 6549371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 6559371c9d4SSatish Balay v2 += 2; 6569371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 6579371c9d4SSatish Balay v3 += 2; 6584c1414c8SBarry Smith } 6594c1414c8SBarry Smith if (n == sz - 1) { 6604c1414c8SBarry Smith tmp0 = x[*idx++]; 6614c1414c8SBarry Smith sum1 += *v1++ * tmp0; 6624c1414c8SBarry Smith sum2 += *v2++ * tmp0; 6634c1414c8SBarry Smith sum3 += *v3++ * tmp0; 6644c1414c8SBarry Smith } 6654c1414c8SBarry Smith y[row++] = sum1; 6664c1414c8SBarry Smith y[row++] = sum2; 6674c1414c8SBarry Smith y[row++] = sum3; 6684c1414c8SBarry Smith v1 = v3; /* Since the next block to be processed starts there*/ 6694c1414c8SBarry Smith idx += 2 * sz; 6704c1414c8SBarry Smith break; 6714c1414c8SBarry Smith case 4: 6724c1414c8SBarry Smith sum1 = *zt++; 6734c1414c8SBarry Smith sum2 = *zt++; 6744c1414c8SBarry Smith sum3 = *zt++; 6754c1414c8SBarry Smith sum4 = *zt++; 
6764c1414c8SBarry Smith v2 = v1 + n; 6774c1414c8SBarry Smith v3 = v2 + n; 6784c1414c8SBarry Smith v4 = v3 + n; 6794c1414c8SBarry Smith 6804c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 6814c1414c8SBarry Smith i1 = idx[0]; 6824c1414c8SBarry Smith i2 = idx[1]; 6834c1414c8SBarry Smith idx += 2; 6844c1414c8SBarry Smith tmp0 = x[i1]; 6854c1414c8SBarry Smith tmp1 = x[i2]; 6869371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 6879371c9d4SSatish Balay v1 += 2; 6889371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 6899371c9d4SSatish Balay v2 += 2; 6909371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 6919371c9d4SSatish Balay v3 += 2; 6929371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1; 6939371c9d4SSatish Balay v4 += 2; 6944c1414c8SBarry Smith } 6954c1414c8SBarry Smith if (n == sz - 1) { 6964c1414c8SBarry Smith tmp0 = x[*idx++]; 6974c1414c8SBarry Smith sum1 += *v1++ * tmp0; 6984c1414c8SBarry Smith sum2 += *v2++ * tmp0; 6994c1414c8SBarry Smith sum3 += *v3++ * tmp0; 7004c1414c8SBarry Smith sum4 += *v4++ * tmp0; 7014c1414c8SBarry Smith } 7024c1414c8SBarry Smith y[row++] = sum1; 7034c1414c8SBarry Smith y[row++] = sum2; 7044c1414c8SBarry Smith y[row++] = sum3; 7054c1414c8SBarry Smith y[row++] = sum4; 7064c1414c8SBarry Smith v1 = v4; /* Since the next block to be processed starts there*/ 7074c1414c8SBarry Smith idx += 3 * sz; 7084c1414c8SBarry Smith break; 7094c1414c8SBarry Smith case 5: 7104c1414c8SBarry Smith sum1 = *zt++; 7114c1414c8SBarry Smith sum2 = *zt++; 7124c1414c8SBarry Smith sum3 = *zt++; 7134c1414c8SBarry Smith sum4 = *zt++; 7144c1414c8SBarry Smith sum5 = *zt++; 7154c1414c8SBarry Smith v2 = v1 + n; 7164c1414c8SBarry Smith v3 = v2 + n; 7174c1414c8SBarry Smith v4 = v3 + n; 7184c1414c8SBarry Smith v5 = v4 + n; 7194c1414c8SBarry Smith 7204c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 7214c1414c8SBarry Smith i1 = idx[0]; 7224c1414c8SBarry Smith i2 = idx[1]; 7234c1414c8SBarry Smith idx += 2; 7244c1414c8SBarry Smith tmp0 = 
x[i1]; 7254c1414c8SBarry Smith tmp1 = x[i2]; 7269371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 7279371c9d4SSatish Balay v1 += 2; 7289371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 7299371c9d4SSatish Balay v2 += 2; 7309371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 7319371c9d4SSatish Balay v3 += 2; 7329371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1; 7339371c9d4SSatish Balay v4 += 2; 7349371c9d4SSatish Balay sum5 += v5[0] * tmp0 + v5[1] * tmp1; 7359371c9d4SSatish Balay v5 += 2; 7364c1414c8SBarry Smith } 7374c1414c8SBarry Smith if (n == sz - 1) { 7384c1414c8SBarry Smith tmp0 = x[*idx++]; 7394c1414c8SBarry Smith sum1 += *v1++ * tmp0; 7404c1414c8SBarry Smith sum2 += *v2++ * tmp0; 7414c1414c8SBarry Smith sum3 += *v3++ * tmp0; 7424c1414c8SBarry Smith sum4 += *v4++ * tmp0; 7434c1414c8SBarry Smith sum5 += *v5++ * tmp0; 7444c1414c8SBarry Smith } 7454c1414c8SBarry Smith y[row++] = sum1; 7464c1414c8SBarry Smith y[row++] = sum2; 7474c1414c8SBarry Smith y[row++] = sum3; 7484c1414c8SBarry Smith y[row++] = sum4; 7494c1414c8SBarry Smith y[row++] = sum5; 7504c1414c8SBarry Smith v1 = v5; /* Since the next block to be processed starts there */ 7514c1414c8SBarry Smith idx += 4 * sz; 7524c1414c8SBarry Smith break; 753d71ae5a4SJacob Faibussowitsch default: 754d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 7554c1414c8SBarry Smith } 7564c1414c8SBarry Smith } 7579566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7589566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 7599566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 7603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 7614c1414c8SBarry Smith } 7624c1414c8SBarry Smith 763ff6a9541SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 764d71ae5a4SJacob Faibussowitsch { 7654c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ 
*)A->data; 7664c1414c8SBarry Smith IS iscol = a->col, isrow = a->row; 7675d0c19d7SBarry Smith const PetscInt *r, *c, *rout, *cout; 7688758e1faSBarry Smith PetscInt i, j, n = A->rmap->n, nz; 7698758e1faSBarry Smith PetscInt node_max, *ns, row, nsz, aii, i0, i1; 7708758e1faSBarry Smith const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 771d9fead3dSBarry Smith PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 772d9fead3dSBarry Smith PetscScalar sum1, sum2, sum3, sum4, sum5; 773dd6ea824SBarry Smith const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 774dd6ea824SBarry Smith const PetscScalar *b; 7754c1414c8SBarry Smith 7764c1414c8SBarry Smith PetscFunctionBegin; 7774d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 7784c1414c8SBarry Smith node_max = a->inode.node_count; 7794d12350bSJunchao Zhang ns = a->inode.size_csr; /* Node Size array */ 7804c1414c8SBarry Smith 7819566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 7829566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(xx, &x)); 7834c1414c8SBarry Smith tmp = a->solve_work; 7844c1414c8SBarry Smith 7859371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 7869371c9d4SSatish Balay r = rout; 7879371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 7889371c9d4SSatish Balay c = cout + (n - 1); 7894c1414c8SBarry Smith 7904c1414c8SBarry Smith /* forward solve the lower triangular */ 7914c1414c8SBarry Smith tmps = tmp; 7924c1414c8SBarry Smith aa = a_a; 7934c1414c8SBarry Smith aj = a_j; 7944c1414c8SBarry Smith ad = a->diag; 7954c1414c8SBarry Smith 7964c1414c8SBarry Smith for (i = 0, row = 0; i < node_max; ++i) { 7974d12350bSJunchao Zhang row = ns[i]; 7984d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i]; 7994c1414c8SBarry Smith aii = ai[row]; 8004c1414c8SBarry Smith v1 = aa + aii; 8014c1414c8SBarry Smith vi = aj + aii; 8024c1414c8SBarry Smith nz = ad[row] - aii; 80326549573SJed Brown if (i < node_max - 1) { 80426549573SJed Brown /* 
Prefetch the block after the current one, the prefetch itself can't cause a memory error, 80591c35059SPierre Jolivet * but our indexing to determine its size could. */ 80650d8bf02SJed Brown PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 80726549573SJed Brown /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 8084d12350bSJunchao Zhang PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 80926549573SJed Brown /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 81026549573SJed Brown } 8114c1414c8SBarry Smith 8124c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */ 8134c1414c8SBarry Smith case 1: 8144c1414c8SBarry Smith sum1 = b[*r++]; 8154c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 8164c1414c8SBarry Smith i0 = vi[0]; 8174c1414c8SBarry Smith i1 = vi[1]; 8184c1414c8SBarry Smith vi += 2; 8194c1414c8SBarry Smith tmp0 = tmps[i0]; 8204c1414c8SBarry Smith tmp1 = tmps[i1]; 8219371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 8229371c9d4SSatish Balay v1 += 2; 8234c1414c8SBarry Smith } 8244c1414c8SBarry Smith if (j == nz - 1) { 8254c1414c8SBarry Smith tmp0 = tmps[*vi++]; 8264c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 8274c1414c8SBarry Smith } 8284c1414c8SBarry Smith tmp[row++] = sum1; 8294c1414c8SBarry Smith break; 8304c1414c8SBarry Smith case 2: 8314c1414c8SBarry Smith sum1 = b[*r++]; 8324c1414c8SBarry Smith sum2 = b[*r++]; 8334c1414c8SBarry Smith v2 = aa + ai[row + 1]; 8344c1414c8SBarry Smith 8354c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 8364c1414c8SBarry Smith i0 = vi[0]; 8374c1414c8SBarry Smith i1 = vi[1]; 8384c1414c8SBarry Smith vi += 2; 8394c1414c8SBarry Smith tmp0 = tmps[i0]; 8404c1414c8SBarry Smith tmp1 = tmps[i1]; 8419371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 8429371c9d4SSatish Balay v1 += 
2; 8439371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 8449371c9d4SSatish Balay v2 += 2; 8454c1414c8SBarry Smith } 8464c1414c8SBarry Smith if (j == nz - 1) { 8474c1414c8SBarry Smith tmp0 = tmps[*vi++]; 8484c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 8494c1414c8SBarry Smith sum2 -= *v2++ * tmp0; 8504c1414c8SBarry Smith } 8514c1414c8SBarry Smith sum2 -= *v2++ * sum1; 8524c1414c8SBarry Smith tmp[row++] = sum1; 8534c1414c8SBarry Smith tmp[row++] = sum2; 8544c1414c8SBarry Smith break; 8554c1414c8SBarry Smith case 3: 8564c1414c8SBarry Smith sum1 = b[*r++]; 8574c1414c8SBarry Smith sum2 = b[*r++]; 8584c1414c8SBarry Smith sum3 = b[*r++]; 8594c1414c8SBarry Smith v2 = aa + ai[row + 1]; 8604c1414c8SBarry Smith v3 = aa + ai[row + 2]; 8614c1414c8SBarry Smith 8624c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 8634c1414c8SBarry Smith i0 = vi[0]; 8644c1414c8SBarry Smith i1 = vi[1]; 8654c1414c8SBarry Smith vi += 2; 8664c1414c8SBarry Smith tmp0 = tmps[i0]; 8674c1414c8SBarry Smith tmp1 = tmps[i1]; 8689371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 8699371c9d4SSatish Balay v1 += 2; 8709371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 8719371c9d4SSatish Balay v2 += 2; 8729371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 8739371c9d4SSatish Balay v3 += 2; 8744c1414c8SBarry Smith } 8754c1414c8SBarry Smith if (j == nz - 1) { 8764c1414c8SBarry Smith tmp0 = tmps[*vi++]; 8774c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 8784c1414c8SBarry Smith sum2 -= *v2++ * tmp0; 8794c1414c8SBarry Smith sum3 -= *v3++ * tmp0; 8804c1414c8SBarry Smith } 8814c1414c8SBarry Smith sum2 -= *v2++ * sum1; 8824c1414c8SBarry Smith sum3 -= *v3++ * sum1; 8834c1414c8SBarry Smith sum3 -= *v3++ * sum2; 8842205254eSKarl Rupp 8854c1414c8SBarry Smith tmp[row++] = sum1; 8864c1414c8SBarry Smith tmp[row++] = sum2; 8874c1414c8SBarry Smith tmp[row++] = sum3; 8884c1414c8SBarry Smith break; 8894c1414c8SBarry Smith 8904c1414c8SBarry Smith case 4: 8914c1414c8SBarry Smith sum1 = b[*r++]; 
8924c1414c8SBarry Smith sum2 = b[*r++]; 8934c1414c8SBarry Smith sum3 = b[*r++]; 8944c1414c8SBarry Smith sum4 = b[*r++]; 8954c1414c8SBarry Smith v2 = aa + ai[row + 1]; 8964c1414c8SBarry Smith v3 = aa + ai[row + 2]; 8974c1414c8SBarry Smith v4 = aa + ai[row + 3]; 8984c1414c8SBarry Smith 8994c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 9004c1414c8SBarry Smith i0 = vi[0]; 9014c1414c8SBarry Smith i1 = vi[1]; 9024c1414c8SBarry Smith vi += 2; 9034c1414c8SBarry Smith tmp0 = tmps[i0]; 9044c1414c8SBarry Smith tmp1 = tmps[i1]; 9059371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 9069371c9d4SSatish Balay v1 += 2; 9079371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 9089371c9d4SSatish Balay v2 += 2; 9099371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 9109371c9d4SSatish Balay v3 += 2; 9119371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 9129371c9d4SSatish Balay v4 += 2; 9134c1414c8SBarry Smith } 9144c1414c8SBarry Smith if (j == nz - 1) { 9154c1414c8SBarry Smith tmp0 = tmps[*vi++]; 9164c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 9174c1414c8SBarry Smith sum2 -= *v2++ * tmp0; 9184c1414c8SBarry Smith sum3 -= *v3++ * tmp0; 9194c1414c8SBarry Smith sum4 -= *v4++ * tmp0; 9204c1414c8SBarry Smith } 9214c1414c8SBarry Smith sum2 -= *v2++ * sum1; 9224c1414c8SBarry Smith sum3 -= *v3++ * sum1; 9234c1414c8SBarry Smith sum4 -= *v4++ * sum1; 9244c1414c8SBarry Smith sum3 -= *v3++ * sum2; 9254c1414c8SBarry Smith sum4 -= *v4++ * sum2; 9264c1414c8SBarry Smith sum4 -= *v4++ * sum3; 9274c1414c8SBarry Smith 9284c1414c8SBarry Smith tmp[row++] = sum1; 9294c1414c8SBarry Smith tmp[row++] = sum2; 9304c1414c8SBarry Smith tmp[row++] = sum3; 9314c1414c8SBarry Smith tmp[row++] = sum4; 9324c1414c8SBarry Smith break; 9334c1414c8SBarry Smith case 5: 9344c1414c8SBarry Smith sum1 = b[*r++]; 9354c1414c8SBarry Smith sum2 = b[*r++]; 9364c1414c8SBarry Smith sum3 = b[*r++]; 9374c1414c8SBarry Smith sum4 = b[*r++]; 9384c1414c8SBarry Smith sum5 = b[*r++]; 9394c1414c8SBarry Smith v2 
= aa + ai[row + 1]; 9404c1414c8SBarry Smith v3 = aa + ai[row + 2]; 9414c1414c8SBarry Smith v4 = aa + ai[row + 3]; 9424c1414c8SBarry Smith v5 = aa + ai[row + 4]; 9434c1414c8SBarry Smith 9444c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 9454c1414c8SBarry Smith i0 = vi[0]; 9464c1414c8SBarry Smith i1 = vi[1]; 9474c1414c8SBarry Smith vi += 2; 9484c1414c8SBarry Smith tmp0 = tmps[i0]; 9494c1414c8SBarry Smith tmp1 = tmps[i1]; 9509371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 9519371c9d4SSatish Balay v1 += 2; 9529371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 9539371c9d4SSatish Balay v2 += 2; 9549371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 9559371c9d4SSatish Balay v3 += 2; 9569371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 9579371c9d4SSatish Balay v4 += 2; 9589371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 9599371c9d4SSatish Balay v5 += 2; 9604c1414c8SBarry Smith } 9614c1414c8SBarry Smith if (j == nz - 1) { 9624c1414c8SBarry Smith tmp0 = tmps[*vi++]; 9634c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 9644c1414c8SBarry Smith sum2 -= *v2++ * tmp0; 9654c1414c8SBarry Smith sum3 -= *v3++ * tmp0; 9664c1414c8SBarry Smith sum4 -= *v4++ * tmp0; 9674c1414c8SBarry Smith sum5 -= *v5++ * tmp0; 9684c1414c8SBarry Smith } 9694c1414c8SBarry Smith 9704c1414c8SBarry Smith sum2 -= *v2++ * sum1; 9714c1414c8SBarry Smith sum3 -= *v3++ * sum1; 9724c1414c8SBarry Smith sum4 -= *v4++ * sum1; 9734c1414c8SBarry Smith sum5 -= *v5++ * sum1; 9744c1414c8SBarry Smith sum3 -= *v3++ * sum2; 9754c1414c8SBarry Smith sum4 -= *v4++ * sum2; 9764c1414c8SBarry Smith sum5 -= *v5++ * sum2; 9774c1414c8SBarry Smith sum4 -= *v4++ * sum3; 9784c1414c8SBarry Smith sum5 -= *v5++ * sum3; 9794c1414c8SBarry Smith sum5 -= *v5++ * sum4; 9804c1414c8SBarry Smith 9814c1414c8SBarry Smith tmp[row++] = sum1; 9824c1414c8SBarry Smith tmp[row++] = sum2; 9834c1414c8SBarry Smith tmp[row++] = sum3; 9844c1414c8SBarry Smith tmp[row++] = sum4; 9854c1414c8SBarry Smith tmp[row++] = 
sum5; 9864c1414c8SBarry Smith break; 987d71ae5a4SJacob Faibussowitsch default: 988d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 9894c1414c8SBarry Smith } 9904c1414c8SBarry Smith } 9914c1414c8SBarry Smith /* backward solve the upper triangular */ 9924d12350bSJunchao Zhang for (i = node_max - 1; i >= 0; i--) { 9934d12350bSJunchao Zhang row = ns[i + 1]; 9944d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i]; 9954c1414c8SBarry Smith aii = ai[row + 1] - 1; 9964c1414c8SBarry Smith v1 = aa + aii; 9974c1414c8SBarry Smith vi = aj + aii; 9984c1414c8SBarry Smith nz = aii - ad[row]; 9994c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */ 10004c1414c8SBarry Smith case 1: 10014c1414c8SBarry Smith sum1 = tmp[row]; 10024c1414c8SBarry Smith 10034c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 10044c1414c8SBarry Smith vi -= 2; 10054c1414c8SBarry Smith i0 = vi[2]; 10064c1414c8SBarry Smith i1 = vi[1]; 10074c1414c8SBarry Smith tmp0 = tmps[i0]; 10084c1414c8SBarry Smith tmp1 = tmps[i1]; 10094c1414c8SBarry Smith v1 -= 2; 10104c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 10114c1414c8SBarry Smith } 10124c1414c8SBarry Smith if (j == 1) { 10134c1414c8SBarry Smith tmp0 = tmps[*vi--]; 10144c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 10154c1414c8SBarry Smith } 10169371c9d4SSatish Balay x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 10179371c9d4SSatish Balay row--; 10184c1414c8SBarry Smith break; 10194c1414c8SBarry Smith case 2: 10204c1414c8SBarry Smith sum1 = tmp[row]; 10214c1414c8SBarry Smith sum2 = tmp[row - 1]; 10224c1414c8SBarry Smith v2 = aa + ai[row] - 1; 10234c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 10244c1414c8SBarry Smith vi -= 2; 10254c1414c8SBarry Smith i0 = vi[2]; 10264c1414c8SBarry Smith i1 = vi[1]; 10274c1414c8SBarry Smith tmp0 = tmps[i0]; 10284c1414c8SBarry Smith tmp1 = tmps[i1]; 10294c1414c8SBarry Smith v1 -= 2; 10304c1414c8SBarry Smith v2 -= 2; 10314c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + 
v1[1] * tmp1; 10324c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 10334c1414c8SBarry Smith } 10344c1414c8SBarry Smith if (j == 1) { 10354c1414c8SBarry Smith tmp0 = tmps[*vi--]; 10364c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 10374c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 10384c1414c8SBarry Smith } 10394c1414c8SBarry Smith 10409371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 10419371c9d4SSatish Balay row--; 10424c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 10439371c9d4SSatish Balay x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 10449371c9d4SSatish Balay row--; 10454c1414c8SBarry Smith break; 10464c1414c8SBarry Smith case 3: 10474c1414c8SBarry Smith sum1 = tmp[row]; 10484c1414c8SBarry Smith sum2 = tmp[row - 1]; 10494c1414c8SBarry Smith sum3 = tmp[row - 2]; 10504c1414c8SBarry Smith v2 = aa + ai[row] - 1; 10514c1414c8SBarry Smith v3 = aa + ai[row - 1] - 1; 10524c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 10534c1414c8SBarry Smith vi -= 2; 10544c1414c8SBarry Smith i0 = vi[2]; 10554c1414c8SBarry Smith i1 = vi[1]; 10564c1414c8SBarry Smith tmp0 = tmps[i0]; 10574c1414c8SBarry Smith tmp1 = tmps[i1]; 10584c1414c8SBarry Smith v1 -= 2; 10594c1414c8SBarry Smith v2 -= 2; 10604c1414c8SBarry Smith v3 -= 2; 10614c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 10624c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 10634c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 10644c1414c8SBarry Smith } 10654c1414c8SBarry Smith if (j == 1) { 10664c1414c8SBarry Smith tmp0 = tmps[*vi--]; 10674c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 10684c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 10694c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 10704c1414c8SBarry Smith } 10719371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 10729371c9d4SSatish Balay row--; 10734c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 10744c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 10759371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 10769371c9d4SSatish 
Balay row--; 10774c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 10789371c9d4SSatish Balay x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 10799371c9d4SSatish Balay row--; 10804c1414c8SBarry Smith 10814c1414c8SBarry Smith break; 10824c1414c8SBarry Smith case 4: 10834c1414c8SBarry Smith sum1 = tmp[row]; 10844c1414c8SBarry Smith sum2 = tmp[row - 1]; 10854c1414c8SBarry Smith sum3 = tmp[row - 2]; 10864c1414c8SBarry Smith sum4 = tmp[row - 3]; 10874c1414c8SBarry Smith v2 = aa + ai[row] - 1; 10884c1414c8SBarry Smith v3 = aa + ai[row - 1] - 1; 10894c1414c8SBarry Smith v4 = aa + ai[row - 2] - 1; 10904c1414c8SBarry Smith 10914c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 10924c1414c8SBarry Smith vi -= 2; 10934c1414c8SBarry Smith i0 = vi[2]; 10944c1414c8SBarry Smith i1 = vi[1]; 10954c1414c8SBarry Smith tmp0 = tmps[i0]; 10964c1414c8SBarry Smith tmp1 = tmps[i1]; 10974c1414c8SBarry Smith v1 -= 2; 10984c1414c8SBarry Smith v2 -= 2; 10994c1414c8SBarry Smith v3 -= 2; 11004c1414c8SBarry Smith v4 -= 2; 11014c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 11024c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 11034c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 11044c1414c8SBarry Smith sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 11054c1414c8SBarry Smith } 11064c1414c8SBarry Smith if (j == 1) { 11074c1414c8SBarry Smith tmp0 = tmps[*vi--]; 11084c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 11094c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 11104c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11114c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11124c1414c8SBarry Smith } 11134c1414c8SBarry Smith 11149371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 11159371c9d4SSatish Balay row--; 11164c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 11174c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11184c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11199371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 11209371c9d4SSatish Balay row--; 11214c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 
11224c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11239371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 11249371c9d4SSatish Balay row--; 11254c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11269371c9d4SSatish Balay x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 11279371c9d4SSatish Balay row--; 11284c1414c8SBarry Smith break; 11294c1414c8SBarry Smith case 5: 11304c1414c8SBarry Smith sum1 = tmp[row]; 11314c1414c8SBarry Smith sum2 = tmp[row - 1]; 11324c1414c8SBarry Smith sum3 = tmp[row - 2]; 11334c1414c8SBarry Smith sum4 = tmp[row - 3]; 11344c1414c8SBarry Smith sum5 = tmp[row - 4]; 11354c1414c8SBarry Smith v2 = aa + ai[row] - 1; 11364c1414c8SBarry Smith v3 = aa + ai[row - 1] - 1; 11374c1414c8SBarry Smith v4 = aa + ai[row - 2] - 1; 11384c1414c8SBarry Smith v5 = aa + ai[row - 3] - 1; 11394c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 11404c1414c8SBarry Smith vi -= 2; 11414c1414c8SBarry Smith i0 = vi[2]; 11424c1414c8SBarry Smith i1 = vi[1]; 11434c1414c8SBarry Smith tmp0 = tmps[i0]; 11444c1414c8SBarry Smith tmp1 = tmps[i1]; 11454c1414c8SBarry Smith v1 -= 2; 11464c1414c8SBarry Smith v2 -= 2; 11474c1414c8SBarry Smith v3 -= 2; 11484c1414c8SBarry Smith v4 -= 2; 11494c1414c8SBarry Smith v5 -= 2; 11504c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 11514c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 11524c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 11534c1414c8SBarry Smith sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 11544c1414c8SBarry Smith sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 11554c1414c8SBarry Smith } 11564c1414c8SBarry Smith if (j == 1) { 11574c1414c8SBarry Smith tmp0 = tmps[*vi--]; 11584c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 11594c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 11604c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11614c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11624c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11634c1414c8SBarry Smith } 11644c1414c8SBarry Smith 11659371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 
11669371c9d4SSatish Balay row--; 11674c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 11684c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11694c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11704c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11719371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 11729371c9d4SSatish Balay row--; 11734c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11744c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11754c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11769371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 11779371c9d4SSatish Balay row--; 11784c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11794c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11809371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 11819371c9d4SSatish Balay row--; 11824c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11839371c9d4SSatish Balay x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 11849371c9d4SSatish Balay row--; 11854c1414c8SBarry Smith break; 1186d71ae5a4SJacob Faibussowitsch default: 1187d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 11884c1414c8SBarry Smith } 11894c1414c8SBarry Smith } 11909566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 11919566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 11929566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 11939566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(xx, &x)); 11949566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 11953ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 11964c1414c8SBarry Smith } 11974c1414c8SBarry Smith 1198d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1199d71ae5a4SJacob Faibussowitsch { 120028f1b45aSHong Zhang Mat C = B; 120128f1b45aSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 120228f1b45aSHong Zhang IS isrow = 
b->row, isicol = b->icol; 120328f1b45aSHong Zhang const PetscInt *r, *ic, *ics; 120428f1b45aSHong Zhang const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 120528f1b45aSHong Zhang PetscInt i, j, k, nz, nzL, row, *pj; 120628f1b45aSHong Zhang const PetscInt *ajtmp, *bjtmp; 12079877982aSShri Abhyankar MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 12089877982aSShri Abhyankar const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 120928f1b45aSHong Zhang FactorShiftCtx sctx; 12104f81c4b7SBarry Smith const PetscInt *ddiag; 121128f1b45aSHong Zhang PetscReal rs; 121228f1b45aSHong Zhang MatScalar d; 12134f81c4b7SBarry Smith PetscInt inod, nodesz, node_max, col; 12144f81c4b7SBarry Smith const PetscInt *ns; 121507b50cabSHong Zhang PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 12160e95ead3SHong Zhang 121728f1b45aSHong Zhang PetscFunctionBegin; 121828f1b45aSHong Zhang /* MatPivotSetUp(): initialize shift context sctx */ 12199566063dSJacob Faibussowitsch PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 122028f1b45aSHong Zhang 1221f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 122228f1b45aSHong Zhang ddiag = a->diag; 122328f1b45aSHong Zhang sctx.shift_top = info->zeropivot; 122428f1b45aSHong Zhang for (i = 0; i < n; i++) { 122528f1b45aSHong Zhang /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 122628f1b45aSHong Zhang d = (aa)[ddiag[i]]; 122728f1b45aSHong Zhang rs = -PetscAbsScalar(d) - PetscRealPart(d); 122828f1b45aSHong Zhang v = aa + ai[i]; 122928f1b45aSHong Zhang nz = ai[i + 1] - ai[i]; 12302205254eSKarl Rupp for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 123128f1b45aSHong Zhang if (rs > sctx.shift_top) sctx.shift_top = rs; 123228f1b45aSHong Zhang } 123328f1b45aSHong Zhang sctx.shift_top *= 1.1; 123428f1b45aSHong Zhang sctx.nshift_max = 5; 123528f1b45aSHong Zhang sctx.shift_lo = 
0.; 123628f1b45aSHong Zhang sctx.shift_hi = 1.; 123728f1b45aSHong Zhang } 123828f1b45aSHong Zhang 12399566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 12409566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 124168785679SHong Zhang 12429566063dSJacob Faibussowitsch PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 124328f1b45aSHong Zhang ics = ic; 124428f1b45aSHong Zhang 124528f1b45aSHong Zhang node_max = a->inode.node_count; 12464d12350bSJunchao Zhang ns = a->inode.size_csr; 124728b400f6SJacob Faibussowitsch PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 124828f1b45aSHong Zhang 12499877982aSShri Abhyankar /* If max inode size > 4, split it into two inodes.*/ 125068785679SHong Zhang /* also map the inode sizes according to the ordering */ 12519566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 125268785679SHong Zhang for (i = 0, j = 0; i < node_max; ++i, ++j) { 12534d12350bSJunchao Zhang nodesz = ns[i + 1] - ns[i]; 12544d12350bSJunchao Zhang if (nodesz > 4) { 1255048b5e81SShri Abhyankar tmp_vec1[j] = 4; 125668785679SHong Zhang ++j; 12574d12350bSJunchao Zhang tmp_vec1[j] = nodesz - tmp_vec1[j - 1]; 125868785679SHong Zhang } else { 12594d12350bSJunchao Zhang tmp_vec1[j] = nodesz; 126068785679SHong Zhang } 126168785679SHong Zhang } 126268785679SHong Zhang /* Use the correct node_max */ 126368785679SHong Zhang node_max = j; 126468785679SHong Zhang 126568785679SHong Zhang /* Now reorder the inode info based on mat re-ordering info */ 126668785679SHong Zhang /* First create a row -> inode_size_array_index map */ 12679566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &nsmap)); 12689566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 12694d12350bSJunchao Zhang tmp_vec2[0] = 0; 127068785679SHong Zhang for (i = 0, row = 0; i < node_max; i++) { 127168785679SHong Zhang nodesz = tmp_vec1[i]; 1272ad540459SPierre Jolivet for (j = 0; j < 
nodesz; j++, row++) nsmap[row] = i; 127368785679SHong Zhang } 127468785679SHong Zhang /* Using nsmap, create a reordered ns structure */ 127568785679SHong Zhang for (i = 0, j = 0; i < node_max; i++) { 127668785679SHong Zhang nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 12774d12350bSJunchao Zhang tmp_vec2[i + 1] = tmp_vec2[i] + nodesz; 127868785679SHong Zhang j += nodesz; 127968785679SHong Zhang } 12809566063dSJacob Faibussowitsch PetscCall(PetscFree(nsmap)); 12819566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp_vec1)); 1282b89f182dSHong Zhang 128368785679SHong Zhang /* Now use the correct ns */ 128468785679SHong Zhang ns = tmp_vec2; 128568785679SHong Zhang 128628f1b45aSHong Zhang do { 128707b50cabSHong Zhang sctx.newshift = PETSC_FALSE; 128828f1b45aSHong Zhang /* Now loop over each block-row, and do the factorization */ 128928f1b45aSHong Zhang for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 12904d12350bSJunchao Zhang nodesz = ns[inod + 1] - ns[inod]; 129128f1b45aSHong Zhang 129228f1b45aSHong Zhang switch (nodesz) { 129328f1b45aSHong Zhang case 1: 1294b89f182dSHong Zhang /* zero rtmp1 */ 129528f1b45aSHong Zhang /* L part */ 129628f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 129728f1b45aSHong Zhang bjtmp = bj + bi[i]; 1298b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 129928f1b45aSHong Zhang 130028f1b45aSHong Zhang /* U part */ 130128f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1]; 130228f1b45aSHong Zhang bjtmp = bj + bdiag[i + 1] + 1; 1303b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 130428f1b45aSHong Zhang 130528f1b45aSHong Zhang /* load in initial (unfactored row) */ 130628f1b45aSHong Zhang nz = ai[r[i] + 1] - ai[r[i]]; 130728f1b45aSHong Zhang ajtmp = aj + ai[r[i]]; 130828f1b45aSHong Zhang v = aa + ai[r[i]]; 13092205254eSKarl Rupp for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 13102205254eSKarl Rupp 131128f1b45aSHong Zhang /* ZeropivotApply() */ 
1312b89f182dSHong Zhang rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 131328f1b45aSHong Zhang 131428f1b45aSHong Zhang /* elimination */ 131528f1b45aSHong Zhang bjtmp = bj + bi[i]; 131628f1b45aSHong Zhang row = *bjtmp++; 131728f1b45aSHong Zhang nzL = bi[i + 1] - bi[i]; 131828f1b45aSHong Zhang for (k = 0; k < nzL; k++) { 1319b89f182dSHong Zhang pc = rtmp1 + row; 132028f1b45aSHong Zhang if (*pc != 0.0) { 132128f1b45aSHong Zhang pv = b->a + bdiag[row]; 1322b89f182dSHong Zhang mul1 = *pc * (*pv); 1323b89f182dSHong Zhang *pc = mul1; 132428f1b45aSHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 132528f1b45aSHong Zhang pv = b->a + bdiag[row + 1] + 1; 132628f1b45aSHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1327b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 13289566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 132928f1b45aSHong Zhang } 133028f1b45aSHong Zhang row = *bjtmp++; 133128f1b45aSHong Zhang } 133228f1b45aSHong Zhang 133328f1b45aSHong Zhang /* finished row so stick it into b->a */ 133428f1b45aSHong Zhang rs = 0.0; 133528f1b45aSHong Zhang /* L part */ 133628f1b45aSHong Zhang pv = b->a + bi[i]; 133728f1b45aSHong Zhang pj = b->j + bi[i]; 133828f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 133928f1b45aSHong Zhang for (j = 0; j < nz; j++) { 13409371c9d4SSatish Balay pv[j] = rtmp1[pj[j]]; 13419371c9d4SSatish Balay rs += PetscAbsScalar(pv[j]); 134228f1b45aSHong Zhang } 134328f1b45aSHong Zhang 134428f1b45aSHong Zhang /* U part */ 134528f1b45aSHong Zhang pv = b->a + bdiag[i + 1] + 1; 134628f1b45aSHong Zhang pj = b->j + bdiag[i + 1] + 1; 134728f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; 134828f1b45aSHong Zhang for (j = 0; j < nz; j++) { 13499371c9d4SSatish Balay pv[j] = rtmp1[pj[j]]; 13509371c9d4SSatish Balay rs += PetscAbsScalar(pv[j]); 135128f1b45aSHong Zhang } 135228f1b45aSHong Zhang 1353b89f182dSHong Zhang /* Check zero 
pivot */ 135428f1b45aSHong Zhang sctx.rs = rs; 1355b89f182dSHong Zhang sctx.pv = rtmp1[i]; 13569566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 135707b50cabSHong Zhang if (sctx.newshift) break; 135828f1b45aSHong Zhang 1359a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */ 136028f1b45aSHong Zhang pv = b->a + bdiag[i]; 1361b89f182dSHong Zhang *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 136228f1b45aSHong Zhang break; 136328f1b45aSHong Zhang 136428f1b45aSHong Zhang case 2: 1365b89f182dSHong Zhang /* zero rtmp1 and rtmp2 */ 136628f1b45aSHong Zhang /* L part */ 136728f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 136828f1b45aSHong Zhang bjtmp = bj + bi[i]; 136928f1b45aSHong Zhang for (j = 0; j < nz; j++) { 137068785679SHong Zhang col = bjtmp[j]; 13719371c9d4SSatish Balay rtmp1[col] = 0.0; 13729371c9d4SSatish Balay rtmp2[col] = 0.0; 137328f1b45aSHong Zhang } 137428f1b45aSHong Zhang 137528f1b45aSHong Zhang /* U part */ 137628f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1]; 137728f1b45aSHong Zhang bjtmp = bj + bdiag[i + 1] + 1; 137828f1b45aSHong Zhang for (j = 0; j < nz; j++) { 137968785679SHong Zhang col = bjtmp[j]; 13809371c9d4SSatish Balay rtmp1[col] = 0.0; 13819371c9d4SSatish Balay rtmp2[col] = 0.0; 138228f1b45aSHong Zhang } 138328f1b45aSHong Zhang 138428f1b45aSHong Zhang /* load in initial (unfactored row) */ 138528f1b45aSHong Zhang nz = ai[r[i] + 1] - ai[r[i]]; 138628f1b45aSHong Zhang ajtmp = aj + ai[r[i]]; 13879371c9d4SSatish Balay v1 = aa + ai[r[i]]; 13881a303e4dSPierre Jolivet v2 = aa + ai[r[i + 1]]; 138928f1b45aSHong Zhang for (j = 0; j < nz; j++) { 139068785679SHong Zhang col = ics[ajtmp[j]]; 13919371c9d4SSatish Balay rtmp1[col] = v1[j]; 13929371c9d4SSatish Balay rtmp2[col] = v2[j]; 139328f1b45aSHong Zhang } 139428f1b45aSHong Zhang /* ZeropivotApply(): shift the diagonal of the matrix */ 13959371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount; 
13969371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount; 139728f1b45aSHong Zhang 139828f1b45aSHong Zhang /* elimination */ 139928f1b45aSHong Zhang bjtmp = bj + bi[i]; 140028f1b45aSHong Zhang row = *bjtmp++; /* pivot row */ 140128f1b45aSHong Zhang nzL = bi[i + 1] - bi[i]; 140228f1b45aSHong Zhang for (k = 0; k < nzL; k++) { 1403b89f182dSHong Zhang pc1 = rtmp1 + row; 1404b89f182dSHong Zhang pc2 = rtmp2 + row; 140528f1b45aSHong Zhang if (*pc1 != 0.0 || *pc2 != 0.0) { 140628f1b45aSHong Zhang pv = b->a + bdiag[row]; 14079371c9d4SSatish Balay mul1 = *pc1 * (*pv); 14089371c9d4SSatish Balay mul2 = *pc2 * (*pv); 14099371c9d4SSatish Balay *pc1 = mul1; 14109371c9d4SSatish Balay *pc2 = mul2; 141128f1b45aSHong Zhang 141228f1b45aSHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 141328f1b45aSHong Zhang pv = b->a + bdiag[row + 1] + 1; 141428f1b45aSHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 141528f1b45aSHong Zhang for (j = 0; j < nz; j++) { 141668785679SHong Zhang col = pj[j]; 1417b89f182dSHong Zhang rtmp1[col] -= mul1 * pv[j]; 1418b89f182dSHong Zhang rtmp2[col] -= mul2 * pv[j]; 141928f1b45aSHong Zhang } 14209566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz)); 142128f1b45aSHong Zhang } 142228f1b45aSHong Zhang row = *bjtmp++; 142328f1b45aSHong Zhang } 142428f1b45aSHong Zhang 1425b89f182dSHong Zhang /* finished row i; check zero pivot, then stick row i into b->a */ 142628f1b45aSHong Zhang rs = 0.0; 142728f1b45aSHong Zhang /* L part */ 1428b89f182dSHong Zhang pc1 = b->a + bi[i]; 142928f1b45aSHong Zhang pj = b->j + bi[i]; 143028f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 143128f1b45aSHong Zhang for (j = 0; j < nz; j++) { 143268785679SHong Zhang col = pj[j]; 14339371c9d4SSatish Balay pc1[j] = rtmp1[col]; 14349371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 143528f1b45aSHong Zhang } 143628f1b45aSHong Zhang /* U part */ 1437b89f182dSHong Zhang pc1 = b->a + bdiag[i + 1] + 1; 143828f1b45aSHong 
Zhang pj = b->j + bdiag[i + 1] + 1; 14390e7a5c2bSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 144028f1b45aSHong Zhang for (j = 0; j < nz; j++) { 144168785679SHong Zhang col = pj[j]; 14429371c9d4SSatish Balay pc1[j] = rtmp1[col]; 14439371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 144428f1b45aSHong Zhang } 144528f1b45aSHong Zhang 144628f1b45aSHong Zhang sctx.rs = rs; 1447b89f182dSHong Zhang sctx.pv = rtmp1[i]; 14489566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 144907b50cabSHong Zhang if (sctx.newshift) break; 1450b89f182dSHong Zhang pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1451b89f182dSHong Zhang *pc1 = 1.0 / sctx.pv; 1452b89f182dSHong Zhang 1453b89f182dSHong Zhang /* Now take care of diagonal 2x2 block. */ 1454b89f182dSHong Zhang pc2 = rtmp2 + i; 1455b89f182dSHong Zhang if (*pc2 != 0.0) { 1456b89f182dSHong Zhang mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1457b89f182dSHong Zhang *pc2 = mul1; /* insert L entry */ 1458b89f182dSHong Zhang pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1459b89f182dSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1460b89f182dSHong Zhang for (j = 0; j < nz; j++) { 14619371c9d4SSatish Balay col = pj[j]; 14629371c9d4SSatish Balay rtmp2[col] -= mul1 * rtmp1[col]; 146328f1b45aSHong Zhang } 14649566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1465b89f182dSHong Zhang } 1466b89f182dSHong Zhang 1467b89f182dSHong Zhang /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1468b89f182dSHong Zhang rs = 0.0; 1469b89f182dSHong Zhang /* L part */ 1470b89f182dSHong Zhang pc2 = b->a + bi[i + 1]; 1471b89f182dSHong Zhang pj = b->j + bi[i + 1]; 1472b89f182dSHong Zhang nz = bi[i + 2] - bi[i + 1]; 1473b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1474b89f182dSHong Zhang col = pj[j]; 14759371c9d4SSatish Balay pc2[j] = rtmp2[col]; 14769371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 
1477b89f182dSHong Zhang } 1478b89f182dSHong Zhang /* U part */ 1479b89f182dSHong Zhang pc2 = b->a + bdiag[i + 2] + 1; 14800e7a5c2bSHong Zhang pj = b->j + bdiag[i + 2] + 1; 14810e7a5c2bSHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1482b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1483b89f182dSHong Zhang col = pj[j]; 14849371c9d4SSatish Balay pc2[j] = rtmp2[col]; 14859371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1486b89f182dSHong Zhang } 1487b89f182dSHong Zhang 148828f1b45aSHong Zhang sctx.rs = rs; 1489b89f182dSHong Zhang sctx.pv = rtmp2[i + 1]; 14909566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 149107b50cabSHong Zhang if (sctx.newshift) break; 149228f1b45aSHong Zhang pc2 = b->a + bdiag[i + 1]; 1493b89f182dSHong Zhang *pc2 = 1.0 / sctx.pv; 149428f1b45aSHong Zhang break; 1495b89f182dSHong Zhang 149668785679SHong Zhang case 3: 149768785679SHong Zhang /* zero rtmp */ 149868785679SHong Zhang /* L part */ 149968785679SHong Zhang nz = bi[i + 1] - bi[i]; 150068785679SHong Zhang bjtmp = bj + bi[i]; 150168785679SHong Zhang for (j = 0; j < nz; j++) { 150268785679SHong Zhang col = bjtmp[j]; 15039371c9d4SSatish Balay rtmp1[col] = 0.0; 15049371c9d4SSatish Balay rtmp2[col] = 0.0; 15059371c9d4SSatish Balay rtmp3[col] = 0.0; 150668785679SHong Zhang } 150768785679SHong Zhang 150868785679SHong Zhang /* U part */ 150968785679SHong Zhang nz = bdiag[i] - bdiag[i + 1]; 151068785679SHong Zhang bjtmp = bj + bdiag[i + 1] + 1; 151168785679SHong Zhang for (j = 0; j < nz; j++) { 151268785679SHong Zhang col = bjtmp[j]; 15139371c9d4SSatish Balay rtmp1[col] = 0.0; 15149371c9d4SSatish Balay rtmp2[col] = 0.0; 15159371c9d4SSatish Balay rtmp3[col] = 0.0; 151668785679SHong Zhang } 151768785679SHong Zhang 151868785679SHong Zhang /* load in initial (unfactored row) */ 151968785679SHong Zhang nz = ai[r[i] + 1] - ai[r[i]]; 152068785679SHong Zhang ajtmp = aj + ai[r[i]]; 15219371c9d4SSatish Balay v1 = aa + ai[r[i]]; 15221a303e4dSPierre 
Jolivet v2 = aa + ai[r[i + 1]]; 15231a303e4dSPierre Jolivet v3 = aa + ai[r[i + 2]]; 152468785679SHong Zhang for (j = 0; j < nz; j++) { 152568785679SHong Zhang col = ics[ajtmp[j]]; 15269371c9d4SSatish Balay rtmp1[col] = v1[j]; 15279371c9d4SSatish Balay rtmp2[col] = v2[j]; 15289371c9d4SSatish Balay rtmp3[col] = v3[j]; 152968785679SHong Zhang } 153068785679SHong Zhang /* ZeropivotApply(): shift the diagonal of the matrix */ 15319371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount; 15329371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount; 15339371c9d4SSatish Balay rtmp3[i + 2] += sctx.shift_amount; 153468785679SHong Zhang 153568785679SHong Zhang /* elimination */ 153668785679SHong Zhang bjtmp = bj + bi[i]; 153768785679SHong Zhang row = *bjtmp++; /* pivot row */ 153868785679SHong Zhang nzL = bi[i + 1] - bi[i]; 153968785679SHong Zhang for (k = 0; k < nzL; k++) { 1540b89f182dSHong Zhang pc1 = rtmp1 + row; 1541b89f182dSHong Zhang pc2 = rtmp2 + row; 1542b89f182dSHong Zhang pc3 = rtmp3 + row; 154368785679SHong Zhang if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 154468785679SHong Zhang pv = b->a + bdiag[row]; 15459371c9d4SSatish Balay mul1 = *pc1 * (*pv); 15469371c9d4SSatish Balay mul2 = *pc2 * (*pv); 15479371c9d4SSatish Balay mul3 = *pc3 * (*pv); 15489371c9d4SSatish Balay *pc1 = mul1; 15499371c9d4SSatish Balay *pc2 = mul2; 15509371c9d4SSatish Balay *pc3 = mul3; 155168785679SHong Zhang 155268785679SHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 155368785679SHong Zhang pv = b->a + bdiag[row + 1] + 1; 155468785679SHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 155568785679SHong Zhang for (j = 0; j < nz; j++) { 155668785679SHong Zhang col = pj[j]; 1557b89f182dSHong Zhang rtmp1[col] -= mul1 * pv[j]; 1558b89f182dSHong Zhang rtmp2[col] -= mul2 * pv[j]; 1559b89f182dSHong Zhang rtmp3[col] -= mul3 * pv[j]; 156068785679SHong Zhang } 15619566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(3 + 6.0 * nz)); 
156268785679SHong Zhang } 156368785679SHong Zhang row = *bjtmp++; 156468785679SHong Zhang } 156568785679SHong Zhang 1566b89f182dSHong Zhang /* finished row i; check zero pivot, then stick row i into b->a */ 1567b89f182dSHong Zhang rs = 0.0; 1568b89f182dSHong Zhang /* L part */ 1569b89f182dSHong Zhang pc1 = b->a + bi[i]; 1570b89f182dSHong Zhang pj = b->j + bi[i]; 1571b89f182dSHong Zhang nz = bi[i + 1] - bi[i]; 1572b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1573b89f182dSHong Zhang col = pj[j]; 15749371c9d4SSatish Balay pc1[j] = rtmp1[col]; 15759371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 1576b89f182dSHong Zhang } 1577b89f182dSHong Zhang /* U part */ 1578b89f182dSHong Zhang pc1 = b->a + bdiag[i + 1] + 1; 1579b89f182dSHong Zhang pj = b->j + bdiag[i + 1] + 1; 15800e7a5c2bSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1581b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1582b89f182dSHong Zhang col = pj[j]; 15839371c9d4SSatish Balay pc1[j] = rtmp1[col]; 15849371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 1585b89f182dSHong Zhang } 158668785679SHong Zhang 1587b89f182dSHong Zhang sctx.rs = rs; 1588b89f182dSHong Zhang sctx.pv = rtmp1[i]; 15899566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 159007b50cabSHong Zhang if (sctx.newshift) break; 1591b89f182dSHong Zhang pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1592b89f182dSHong Zhang *pc1 = 1.0 / sctx.pv; 1593b89f182dSHong Zhang 1594b89f182dSHong Zhang /* Now take care of 1st column of diagonal 3x3 block. 
*/ 1595b89f182dSHong Zhang pc2 = rtmp2 + i; 1596b89f182dSHong Zhang pc3 = rtmp3 + i; 1597b89f182dSHong Zhang if (*pc2 != 0.0 || *pc3 != 0.0) { 15989371c9d4SSatish Balay mul2 = (*pc2) * (*pc1); 15999371c9d4SSatish Balay *pc2 = mul2; 16009371c9d4SSatish Balay mul3 = (*pc3) * (*pc1); 16019371c9d4SSatish Balay *pc3 = mul3; 160268785679SHong Zhang pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 160368785679SHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 160468785679SHong Zhang for (j = 0; j < nz; j++) { 160568785679SHong Zhang col = pj[j]; 1606b89f182dSHong Zhang rtmp2[col] -= mul2 * rtmp1[col]; 1607b89f182dSHong Zhang rtmp3[col] -= mul3 * rtmp1[col]; 160868785679SHong Zhang } 16099566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz)); 161068785679SHong Zhang } 161168785679SHong Zhang 1612b89f182dSHong Zhang /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1613b89f182dSHong Zhang rs = 0.0; 1614b89f182dSHong Zhang /* L part */ 1615b89f182dSHong Zhang pc2 = b->a + bi[i + 1]; 1616b89f182dSHong Zhang pj = b->j + bi[i + 1]; 1617b89f182dSHong Zhang nz = bi[i + 2] - bi[i + 1]; 1618b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1619b89f182dSHong Zhang col = pj[j]; 16209371c9d4SSatish Balay pc2[j] = rtmp2[col]; 16219371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1622b89f182dSHong Zhang } 1623b89f182dSHong Zhang /* U part */ 1624b89f182dSHong Zhang pc2 = b->a + bdiag[i + 2] + 1; 16250e7a5c2bSHong Zhang pj = b->j + bdiag[i + 2] + 1; 16260e7a5c2bSHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1627b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1628b89f182dSHong Zhang col = pj[j]; 16299371c9d4SSatish Balay pc2[j] = rtmp2[col]; 16309371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1631b89f182dSHong Zhang } 1632b89f182dSHong Zhang 1633b89f182dSHong Zhang sctx.rs = rs; 1634b89f182dSHong Zhang sctx.pv = rtmp2[i + 1]; 16359566063dSJacob Faibussowitsch 
PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 163607b50cabSHong Zhang if (sctx.newshift) break; 1637b89f182dSHong Zhang pc2 = b->a + bdiag[i + 1]; 1638b89f182dSHong Zhang *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1639b89f182dSHong Zhang 1640b89f182dSHong Zhang /* Now take care of 2nd column of diagonal 3x3 block. */ 1641b89f182dSHong Zhang pc3 = rtmp3 + i + 1; 164268785679SHong Zhang if (*pc3 != 0.0) { 16439371c9d4SSatish Balay mul3 = (*pc3) * (*pc2); 16449371c9d4SSatish Balay *pc3 = mul3; 164568785679SHong Zhang pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 164668785679SHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 164768785679SHong Zhang for (j = 0; j < nz; j++) { 164868785679SHong Zhang col = pj[j]; 1649b89f182dSHong Zhang rtmp3[col] -= mul3 * rtmp2[col]; 165068785679SHong Zhang } 16519566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 165268785679SHong Zhang } 165368785679SHong Zhang 1654b89f182dSHong Zhang /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 165568785679SHong Zhang rs = 0.0; 165668785679SHong Zhang /* L part */ 1657b89f182dSHong Zhang pc3 = b->a + bi[i + 2]; 1658b89f182dSHong Zhang pj = b->j + bi[i + 2]; 1659b89f182dSHong Zhang nz = bi[i + 3] - bi[i + 2]; 166068785679SHong Zhang for (j = 0; j < nz; j++) { 166168785679SHong Zhang col = pj[j]; 16629371c9d4SSatish Balay pc3[j] = rtmp3[col]; 16639371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 166468785679SHong Zhang } 166568785679SHong Zhang /* U part */ 1666b89f182dSHong Zhang pc3 = b->a + bdiag[i + 3] + 1; 16670e7a5c2bSHong Zhang pj = b->j + bdiag[i + 3] + 1; 16680e7a5c2bSHong Zhang nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 166968785679SHong Zhang for (j = 0; j < nz; j++) { 167068785679SHong Zhang col = pj[j]; 16719371c9d4SSatish Balay pc3[j] = rtmp3[col]; 16729371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 167368785679SHong Zhang } 167468785679SHong Zhang 
167568785679SHong Zhang sctx.rs = rs; 1676b89f182dSHong Zhang sctx.pv = rtmp3[i + 2]; 16779566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 167807b50cabSHong Zhang if (sctx.newshift) break; 167968785679SHong Zhang pc3 = b->a + bdiag[i + 2]; 1680b89f182dSHong Zhang *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 168168785679SHong Zhang break; 16829877982aSShri Abhyankar case 4: 16839877982aSShri Abhyankar /* zero rtmp */ 16849877982aSShri Abhyankar /* L part */ 16859877982aSShri Abhyankar nz = bi[i + 1] - bi[i]; 16869877982aSShri Abhyankar bjtmp = bj + bi[i]; 16879877982aSShri Abhyankar for (j = 0; j < nz; j++) { 16889877982aSShri Abhyankar col = bjtmp[j]; 16899371c9d4SSatish Balay rtmp1[col] = 0.0; 16909371c9d4SSatish Balay rtmp2[col] = 0.0; 16919371c9d4SSatish Balay rtmp3[col] = 0.0; 16929371c9d4SSatish Balay rtmp4[col] = 0.0; 16939877982aSShri Abhyankar } 16949877982aSShri Abhyankar 16959877982aSShri Abhyankar /* U part */ 16969877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1]; 16979877982aSShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1; 16989877982aSShri Abhyankar for (j = 0; j < nz; j++) { 16999877982aSShri Abhyankar col = bjtmp[j]; 17009371c9d4SSatish Balay rtmp1[col] = 0.0; 17019371c9d4SSatish Balay rtmp2[col] = 0.0; 17029371c9d4SSatish Balay rtmp3[col] = 0.0; 17039371c9d4SSatish Balay rtmp4[col] = 0.0; 17049877982aSShri Abhyankar } 17059877982aSShri Abhyankar 17069877982aSShri Abhyankar /* load in initial (unfactored row) */ 17079877982aSShri Abhyankar nz = ai[r[i] + 1] - ai[r[i]]; 17089877982aSShri Abhyankar ajtmp = aj + ai[r[i]]; 17099371c9d4SSatish Balay v1 = aa + ai[r[i]]; 17101a303e4dSPierre Jolivet v2 = aa + ai[r[i + 1]]; 17111a303e4dSPierre Jolivet v3 = aa + ai[r[i + 2]]; 17121a303e4dSPierre Jolivet v4 = aa + ai[r[i + 3]]; 17139877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17149877982aSShri Abhyankar col = ics[ajtmp[j]]; 17159371c9d4SSatish Balay rtmp1[col] = v1[j]; 17169371c9d4SSatish Balay rtmp2[col] = v2[j]; 
17179371c9d4SSatish Balay rtmp3[col] = v3[j]; 17189371c9d4SSatish Balay rtmp4[col] = v4[j]; 17199877982aSShri Abhyankar } 17209877982aSShri Abhyankar /* ZeropivotApply(): shift the diagonal of the matrix */ 17219371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount; 17229371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount; 17239371c9d4SSatish Balay rtmp3[i + 2] += sctx.shift_amount; 17249371c9d4SSatish Balay rtmp4[i + 3] += sctx.shift_amount; 17259877982aSShri Abhyankar 17269877982aSShri Abhyankar /* elimination */ 17279877982aSShri Abhyankar bjtmp = bj + bi[i]; 17289877982aSShri Abhyankar row = *bjtmp++; /* pivot row */ 17299877982aSShri Abhyankar nzL = bi[i + 1] - bi[i]; 17309877982aSShri Abhyankar for (k = 0; k < nzL; k++) { 17319877982aSShri Abhyankar pc1 = rtmp1 + row; 17329877982aSShri Abhyankar pc2 = rtmp2 + row; 17339877982aSShri Abhyankar pc3 = rtmp3 + row; 17349877982aSShri Abhyankar pc4 = rtmp4 + row; 17359877982aSShri Abhyankar if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 17369877982aSShri Abhyankar pv = b->a + bdiag[row]; 17379371c9d4SSatish Balay mul1 = *pc1 * (*pv); 17389371c9d4SSatish Balay mul2 = *pc2 * (*pv); 17399371c9d4SSatish Balay mul3 = *pc3 * (*pv); 17409371c9d4SSatish Balay mul4 = *pc4 * (*pv); 17419371c9d4SSatish Balay *pc1 = mul1; 17429371c9d4SSatish Balay *pc2 = mul2; 17439371c9d4SSatish Balay *pc3 = mul3; 17449371c9d4SSatish Balay *pc4 = mul4; 17459877982aSShri Abhyankar 17469877982aSShri Abhyankar pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 17479877982aSShri Abhyankar pv = b->a + bdiag[row + 1] + 1; 17489877982aSShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 17499877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17509877982aSShri Abhyankar col = pj[j]; 17519877982aSShri Abhyankar rtmp1[col] -= mul1 * pv[j]; 17529877982aSShri Abhyankar rtmp2[col] -= mul2 * pv[j]; 17539877982aSShri Abhyankar rtmp3[col] -= mul3 * pv[j]; 17549877982aSShri Abhyankar 
rtmp4[col] -= mul4 * pv[j]; 17559877982aSShri Abhyankar } 17569566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4 + 8.0 * nz)); 17579877982aSShri Abhyankar } 17589877982aSShri Abhyankar row = *bjtmp++; 17599877982aSShri Abhyankar } 17609877982aSShri Abhyankar 17619877982aSShri Abhyankar /* finished row i; check zero pivot, then stick row i into b->a */ 17629877982aSShri Abhyankar rs = 0.0; 17639877982aSShri Abhyankar /* L part */ 17649877982aSShri Abhyankar pc1 = b->a + bi[i]; 17659877982aSShri Abhyankar pj = b->j + bi[i]; 17669877982aSShri Abhyankar nz = bi[i + 1] - bi[i]; 17679877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17689877982aSShri Abhyankar col = pj[j]; 17699371c9d4SSatish Balay pc1[j] = rtmp1[col]; 17709371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 17719877982aSShri Abhyankar } 17729877982aSShri Abhyankar /* U part */ 17739877982aSShri Abhyankar pc1 = b->a + bdiag[i + 1] + 1; 17749877982aSShri Abhyankar pj = b->j + bdiag[i + 1] + 1; 17759877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 17769877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17779877982aSShri Abhyankar col = pj[j]; 17789371c9d4SSatish Balay pc1[j] = rtmp1[col]; 17799371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 17809877982aSShri Abhyankar } 17819877982aSShri Abhyankar 17829877982aSShri Abhyankar sctx.rs = rs; 17839877982aSShri Abhyankar sctx.pv = rtmp1[i]; 17849566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 178507b50cabSHong Zhang if (sctx.newshift) break; 17869877982aSShri Abhyankar pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 17879877982aSShri Abhyankar *pc1 = 1.0 / sctx.pv; 17889877982aSShri Abhyankar 17899877982aSShri Abhyankar /* Now take care of 1st column of diagonal 4x4 block. 
*/ 17909877982aSShri Abhyankar pc2 = rtmp2 + i; 17919877982aSShri Abhyankar pc3 = rtmp3 + i; 17929877982aSShri Abhyankar pc4 = rtmp4 + i; 17939877982aSShri Abhyankar if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 17949371c9d4SSatish Balay mul2 = (*pc2) * (*pc1); 17959371c9d4SSatish Balay *pc2 = mul2; 17969371c9d4SSatish Balay mul3 = (*pc3) * (*pc1); 17979371c9d4SSatish Balay *pc3 = mul3; 17989371c9d4SSatish Balay mul4 = (*pc4) * (*pc1); 17999371c9d4SSatish Balay *pc4 = mul4; 18009877982aSShri Abhyankar pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 18019877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 18029877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18039877982aSShri Abhyankar col = pj[j]; 18049877982aSShri Abhyankar rtmp2[col] -= mul2 * rtmp1[col]; 18059877982aSShri Abhyankar rtmp3[col] -= mul3 * rtmp1[col]; 18069877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp1[col]; 18079877982aSShri Abhyankar } 18089566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(3 + 6.0 * nz)); 18099877982aSShri Abhyankar } 18109877982aSShri Abhyankar 18119877982aSShri Abhyankar /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 18129877982aSShri Abhyankar rs = 0.0; 18139877982aSShri Abhyankar /* L part */ 18149877982aSShri Abhyankar pc2 = b->a + bi[i + 1]; 18159877982aSShri Abhyankar pj = b->j + bi[i + 1]; 18169877982aSShri Abhyankar nz = bi[i + 2] - bi[i + 1]; 18179877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18189877982aSShri Abhyankar col = pj[j]; 18199371c9d4SSatish Balay pc2[j] = rtmp2[col]; 18209371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 18219877982aSShri Abhyankar } 18229877982aSShri Abhyankar /* U part */ 18239877982aSShri Abhyankar pc2 = b->a + bdiag[i + 2] + 1; 18249877982aSShri Abhyankar pj = b->j + bdiag[i + 2] + 1; 18259877982aSShri Abhyankar nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 18269877982aSShri Abhyankar for (j = 0; j < nz; j++) { 
18279877982aSShri Abhyankar col = pj[j]; 18289371c9d4SSatish Balay pc2[j] = rtmp2[col]; 18299371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 18309877982aSShri Abhyankar } 18319877982aSShri Abhyankar 18329877982aSShri Abhyankar sctx.rs = rs; 18339877982aSShri Abhyankar sctx.pv = rtmp2[i + 1]; 18349566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 183507b50cabSHong Zhang if (sctx.newshift) break; 18369877982aSShri Abhyankar pc2 = b->a + bdiag[i + 1]; 18379877982aSShri Abhyankar *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 18389877982aSShri Abhyankar 18399877982aSShri Abhyankar /* Now take care of 2nd column of diagonal 4x4 block. */ 18409877982aSShri Abhyankar pc3 = rtmp3 + i + 1; 18419877982aSShri Abhyankar pc4 = rtmp4 + i + 1; 18429877982aSShri Abhyankar if (*pc3 != 0.0 || *pc4 != 0.0) { 18439371c9d4SSatish Balay mul3 = (*pc3) * (*pc2); 18449371c9d4SSatish Balay *pc3 = mul3; 18459371c9d4SSatish Balay mul4 = (*pc4) * (*pc2); 18469371c9d4SSatish Balay *pc4 = mul4; 18479877982aSShri Abhyankar pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 18489877982aSShri Abhyankar nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 18499877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18509877982aSShri Abhyankar col = pj[j]; 18519877982aSShri Abhyankar rtmp3[col] -= mul3 * rtmp2[col]; 18529877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp2[col]; 18539877982aSShri Abhyankar } 18549566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * nz)); 18559877982aSShri Abhyankar } 18569877982aSShri Abhyankar 18579877982aSShri Abhyankar /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 18589877982aSShri Abhyankar rs = 0.0; 18599877982aSShri Abhyankar /* L part */ 18609877982aSShri Abhyankar pc3 = b->a + bi[i + 2]; 18619877982aSShri Abhyankar pj = b->j + bi[i + 2]; 18629877982aSShri Abhyankar nz = bi[i + 3] - bi[i + 2]; 18639877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18649877982aSShri 
Abhyankar col = pj[j]; 18659371c9d4SSatish Balay pc3[j] = rtmp3[col]; 18669371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 18679877982aSShri Abhyankar } 18689877982aSShri Abhyankar /* U part */ 18699877982aSShri Abhyankar pc3 = b->a + bdiag[i + 3] + 1; 18709877982aSShri Abhyankar pj = b->j + bdiag[i + 3] + 1; 18719877982aSShri Abhyankar nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 18729877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18739877982aSShri Abhyankar col = pj[j]; 18749371c9d4SSatish Balay pc3[j] = rtmp3[col]; 18759371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 18769877982aSShri Abhyankar } 18779877982aSShri Abhyankar 18789877982aSShri Abhyankar sctx.rs = rs; 18799877982aSShri Abhyankar sctx.pv = rtmp3[i + 2]; 18809566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 188107b50cabSHong Zhang if (sctx.newshift) break; 18829877982aSShri Abhyankar pc3 = b->a + bdiag[i + 2]; 18839877982aSShri Abhyankar *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 18849877982aSShri Abhyankar 18859877982aSShri Abhyankar /* Now take care of 3rd column of diagonal 4x4 block. 
*/ 18869877982aSShri Abhyankar pc4 = rtmp4 + i + 2; 18879877982aSShri Abhyankar if (*pc4 != 0.0) { 18889371c9d4SSatish Balay mul4 = (*pc4) * (*pc3); 18899371c9d4SSatish Balay *pc4 = mul4; 18909877982aSShri Abhyankar pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 18919877982aSShri Abhyankar nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 18929877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18939877982aSShri Abhyankar col = pj[j]; 18949877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp3[col]; 18959877982aSShri Abhyankar } 18969566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 18979877982aSShri Abhyankar } 18989877982aSShri Abhyankar 18999877982aSShri Abhyankar /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 19009877982aSShri Abhyankar rs = 0.0; 19019877982aSShri Abhyankar /* L part */ 19029877982aSShri Abhyankar pc4 = b->a + bi[i + 3]; 19039877982aSShri Abhyankar pj = b->j + bi[i + 3]; 19049877982aSShri Abhyankar nz = bi[i + 4] - bi[i + 3]; 19059877982aSShri Abhyankar for (j = 0; j < nz; j++) { 19069877982aSShri Abhyankar col = pj[j]; 19079371c9d4SSatish Balay pc4[j] = rtmp4[col]; 19089371c9d4SSatish Balay rs += PetscAbsScalar(pc4[j]); 19099877982aSShri Abhyankar } 19109877982aSShri Abhyankar /* U part */ 19119877982aSShri Abhyankar pc4 = b->a + bdiag[i + 4] + 1; 19129877982aSShri Abhyankar pj = b->j + bdiag[i + 4] + 1; 19139877982aSShri Abhyankar nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 19149877982aSShri Abhyankar for (j = 0; j < nz; j++) { 19159877982aSShri Abhyankar col = pj[j]; 19169371c9d4SSatish Balay pc4[j] = rtmp4[col]; 19179371c9d4SSatish Balay rs += PetscAbsScalar(pc4[j]); 19189877982aSShri Abhyankar } 19199877982aSShri Abhyankar 19209877982aSShri Abhyankar sctx.rs = rs; 19219877982aSShri Abhyankar sctx.pv = rtmp4[i + 3]; 19229566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 192307b50cabSHong Zhang if (sctx.newshift) 
break; 19249877982aSShri Abhyankar pc4 = b->a + bdiag[i + 3]; 19259877982aSShri Abhyankar *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 19269877982aSShri Abhyankar break; 192768785679SHong Zhang 1928d71ae5a4SJacob Faibussowitsch default: 1929d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 193028f1b45aSHong Zhang } 1931c2b86aeeSHong Zhang if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 193228f1b45aSHong Zhang i += nodesz; /* Update the row */ 193368785679SHong Zhang } 193428f1b45aSHong Zhang 193528f1b45aSHong Zhang /* MatPivotRefine() */ 193607b50cabSHong Zhang if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 193728f1b45aSHong Zhang /* 193828f1b45aSHong Zhang * if no shift in this attempt & shifting & started shifting & can refine, 193928f1b45aSHong Zhang * then try lower shift 194028f1b45aSHong Zhang */ 194128f1b45aSHong Zhang sctx.shift_hi = sctx.shift_fraction; 194228f1b45aSHong Zhang sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 194328f1b45aSHong Zhang sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 194407b50cabSHong Zhang sctx.newshift = PETSC_TRUE; 194528f1b45aSHong Zhang sctx.nshift++; 194628f1b45aSHong Zhang } 194707b50cabSHong Zhang } while (sctx.newshift); 194828f1b45aSHong Zhang 19499566063dSJacob Faibussowitsch PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 19509566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp_vec2)); 19519566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 19529566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 195328f1b45aSHong Zhang 19544d12350bSJunchao Zhang if (b->inode.size_csr) { 1955abb87a52SBarry Smith C->ops->solve = MatSolve_SeqAIJ_Inode; 1956abb87a52SBarry Smith } else { 1957d3ac4fa3SBarry Smith C->ops->solve = MatSolve_SeqAIJ; 1958abb87a52SBarry Smith } 195928f1b45aSHong Zhang 
/*
  MatSolve_SeqAIJ_Inode - Forward and backward triangular solves that process the rows one inode
  (a group of 1 to 5 consecutive rows) at a time.

  Input Parameters:
+ A  - matrix whose Mat_SeqAIJ data supplies the arrays a->i, a->j, a->a, a->diag, the work array
       a->solve_work, the index sets a->row and a->col, and the inode offsets a->inode.size_csr
- bb - right-hand side; read through the row index set (b[r[row]])

  Output Parameter:
. xx - solution; written through the column index set (x[c[row]])

  Notes:
  Raises PETSC_ERR_COR when a->inode.size_csr is missing or when a node has a size outside 1..5.

  The forward sweep stores its result in a->solve_work; the backward sweep overwrites that same
  array in place while also writing xx.

  Every inner loop consumes two matrix entries per iteration; the `if (j == nz - 1)` block that
  follows each loop handles the leftover entry when nz is odd.
*/
PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *r, *c, *rout, *cout;
  PetscInt           i, j;
  PetscInt           node_max, row, nsz, aii, i0, i1, nz;
  const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
  PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
  PetscScalar        sum1, sum2, sum3, sum4, sum5;
  const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
  node_max = a->inode.node_count;
  ns       = a->inode.size_csr; /* Node offsets: node i covers rows ns[i] .. ns[i + 1] - 1 */

  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArrayWrite(xx, &x));
  tmp = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* forward solve the lower triangular */
  tmps = tmp; /* alias of the work array, used for the reads of already computed entries */
  aa   = a_a;
  aj   = a_j;
  ad   = a->diag;

  for (i = 0; i < node_max; ++i) {
    row = ns[i];
    nsz = ns[i + 1] - ns[i];
    aii = ai[row];
    v1  = aa + aii;
    vi  = aj + aii;
    /* Entry count of the first row of the node; the other rows of the node are read with the same column indices vi[] */
    nz = ai[row + 1] - ai[row];

    if (i < node_max - 1) {
      /* Prefetch the indices for the next block */
      PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
      /* Prefetch the data for the next block */
      PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
    }

    switch (nsz) { /* Each loop in 'case' is unrolled */
    case 1:
      sum1 = b[r[row]];
      for (j = 0; j < nz - 1; j += 2) {
        i0   = vi[j];
        i1   = vi[j + 1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
      }
      if (j == nz - 1) {
        tmp0 = tmps[vi[j]];
        sum1 -= v1[j] * tmp0;
      }
      tmp[row++] = sum1;
      break;
    case 2:
      sum1 = b[r[row]];
      sum2 = b[r[row + 1]];
      v2   = aa + ai[row + 1];

      for (j = 0; j < nz - 1; j += 2) {
        i0   = vi[j];
        i1   = vi[j + 1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
        sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
      }
      if (j == nz - 1) {
        tmp0 = tmps[vi[j]];
        sum1 -= v1[j] * tmp0;
        sum2 -= v2[j] * tmp0;
      }
      /* In-node coupling: the k-th row of the node has k extra entries at offsets nz .. nz + k - 1 that multiply the earlier rows of the same node */
      sum2 -= v2[nz] * sum1;
      tmp[row++] = sum1;
      tmp[row++] = sum2;
      break;
    case 3:
      sum1 = b[r[row]];
      sum2 = b[r[row + 1]];
      sum3 = b[r[row + 2]];
      v2   = aa + ai[row + 1];
      v3   = aa + ai[row + 2];

      for (j = 0; j < nz - 1; j += 2) {
        i0   = vi[j];
        i1   = vi[j + 1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
        sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
        sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
      }
      if (j == nz - 1) {
        tmp0 = tmps[vi[j]];
        sum1 -= v1[j] * tmp0;
        sum2 -= v2[j] * tmp0;
        sum3 -= v3[j] * tmp0;
      }
      sum2 -= v2[nz] * sum1;
      sum3 -= v3[nz] * sum1;
      sum3 -= v3[nz + 1] * sum2;
      tmp[row++] = sum1;
      tmp[row++] = sum2;
      tmp[row++] = sum3;
      break;

    case 4:
      sum1 = b[r[row]];
      sum2 = b[r[row + 1]];
      sum3 = b[r[row + 2]];
      sum4 = b[r[row + 3]];
      v2   = aa + ai[row + 1];
      v3   = aa + ai[row + 2];
      v4   = aa + ai[row + 3];

      for (j = 0; j < nz - 1; j += 2) {
        i0   = vi[j];
        i1   = vi[j + 1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
        sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
        sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
        sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
      }
      if (j == nz - 1) {
        tmp0 = tmps[vi[j]];
        sum1 -= v1[j] * tmp0;
        sum2 -= v2[j] * tmp0;
        sum3 -= v3[j] * tmp0;
        sum4 -= v4[j] * tmp0;
      }
      sum2 -= v2[nz] * sum1;
      sum3 -= v3[nz] * sum1;
      sum4 -= v4[nz] * sum1;
      sum3 -= v3[nz + 1] * sum2;
      sum4 -= v4[nz + 1] * sum2;
      sum4 -= v4[nz + 2] * sum3;

      tmp[row++] = sum1;
      tmp[row++] = sum2;
      tmp[row++] = sum3;
      tmp[row++] = sum4;
      break;
    case 5:
      sum1 = b[r[row]];
      sum2 = b[r[row + 1]];
      sum3 = b[r[row + 2]];
      sum4 = b[r[row + 3]];
      sum5 = b[r[row + 4]];
      v2   = aa + ai[row + 1];
      v3   = aa + ai[row + 2];
      v4   = aa + ai[row + 3];
      v5   = aa + ai[row + 4];

      for (j = 0; j < nz - 1; j += 2) {
        i0   = vi[j];
        i1   = vi[j + 1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
        sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
        sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
        sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
        sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
      }
      if (j == nz - 1) {
        tmp0 = tmps[vi[j]];
        sum1 -= v1[j] * tmp0;
        sum2 -= v2[j] * tmp0;
        sum3 -= v3[j] * tmp0;
        sum4 -= v4[j] * tmp0;
        sum5 -= v5[j] * tmp0;
      }

      sum2 -= v2[nz] * sum1;
      sum3 -= v3[nz] * sum1;
      sum4 -= v4[nz] * sum1;
      sum5 -= v5[nz] * sum1;
      sum3 -= v3[nz + 1] * sum2;
      sum4 -= v4[nz + 1] * sum2;
      sum5 -= v5[nz + 1] * sum2;
      sum4 -= v4[nz + 2] * sum3;
      sum5 -= v5[nz + 2] * sum3;
      sum5 -= v5[nz + 3] * sum4;

      tmp[row++] = sum1;
      tmp[row++] = sum2;
      tmp[row++] = sum3;
      tmp[row++] = sum4;
      tmp[row++] = sum5;
      break;
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
    }
  }
  /* backward solve the upper triangular */
  for (i = node_max - 1; i >= 0; i--) {
    row = ns[i + 1] - 1; /* last row of the node; the rows of a node are finished from last to first */
    nsz = ns[i + 1] - ns[i];
    aii = ad[row + 1] + 1;
    v1  = aa + aii;
    vi  = aj + aii;
    /* Off-diagonal count of the last row; v1[nz] is then the entry at position ad[row] */
    nz = ad[row] - ad[row + 1] - 1;

    if (i > 0) {
      /* Prefetch the indices for the next block */
      PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
      /* Prefetch the data for the next block */
      PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
    }

    switch (nsz) { /* Each loop in 'case' is unrolled */
    case 1:
      sum1 = tmp[row];

      for (j = 0; j < nz - 1; j += 2) {
        i0   = vi[j];
        i1   = vi[j + 1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
      }
      if (j == nz - 1) {
        tmp0 = tmps[vi[j]];
        sum1 -= v1[j] * tmp0;
      }
      /* The entry at ad[row] is multiplied rather than divided; presumably the factorization stores the inverted diagonal there - confirm against the numeric factorization */
      x[c[row]] = tmp[row] = sum1 * v1[nz];
      row--;
      break;
    case 2:
      sum1 = tmp[row];
      sum2 = tmp[row - 1];
      v2   = aa + ad[row] + 1;
      /* Row row - 1 starts with one in-node entry (v2[0]), so the entries paired with vi[j] sit one position further along (v2[j + 1]) */
      for (j = 0; j < nz - 1; j += 2) {
        i0   = vi[j];
        i1   = vi[j + 1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
        sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
      }
      if (j == nz - 1) {
        tmp0 = tmps[vi[j]];
        sum1 -= v1[j] * tmp0;
        sum2 -= v2[j + 1] * tmp0;
      }

      tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
      row--;
      sum2 -= v2[0] * tmp0;
      x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
      row--;
      break;
    case 3:
      sum1 = tmp[row];
      sum2 = tmp[row - 1];
      sum3 = tmp[row - 2];
      v2   = aa + ad[row] + 1;
      v3   = aa + ad[row - 1] + 1;
      for (j = 0; j < nz - 1; j += 2) {
        i0   = vi[j];
        i1   = vi[j + 1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
        sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
        sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
      }
      if (j == nz - 1) {
        tmp0 = tmps[vi[j]];
        sum1 -= v1[j] * tmp0;
        sum2 -= v2[j + 1] * tmp0;
        sum3 -= v3[j + 2] * tmp0;
      }
      tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
      row--;
      sum2 -= v2[0] * tmp0;
      sum3 -= v3[1] * tmp0;
      tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
      row--;
      sum3 -= v3[0] * tmp0;
      x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
      row--;

      break;
    case 4:
      sum1 = tmp[row];
      sum2 = tmp[row - 1];
      sum3 = tmp[row - 2];
      sum4 = tmp[row - 3];
      v2   = aa + ad[row] + 1;
      v3   = aa + ad[row - 1] + 1;
      v4   = aa + ad[row - 2] + 1;

      for (j = 0; j < nz - 1; j += 2) {
        i0   = vi[j];
        i1   = vi[j + 1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
        sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
        sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
        sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
      }
      if (j == nz - 1) {
        tmp0 = tmps[vi[j]];
        sum1 -= v1[j] * tmp0;
        sum2 -= v2[j + 1] * tmp0;
        sum3 -= v3[j + 2] * tmp0;
        sum4 -= v4[j + 3] * tmp0;
      }

      tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
      row--;
      sum2 -= v2[0] * tmp0;
      sum3 -= v3[1] * tmp0;
      sum4 -= v4[2] * tmp0;
      tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
      row--;
      sum3 -= v3[0] * tmp0;
      sum4 -= v4[1] * tmp0;
      tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
      row--;
      sum4 -= v4[0] * tmp0;
      x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
      row--;
      break;
    case 5:
      sum1 = tmp[row];
      sum2 = tmp[row - 1];
      sum3 = tmp[row - 2];
      sum4 = tmp[row - 3];
      sum5 = tmp[row - 4];
      v2   = aa + ad[row] + 1;
      v3   = aa + ad[row - 1] + 1;
      v4   = aa + ad[row - 2] + 1;
      v5   = aa + ad[row - 3] + 1;
      for (j = 0; j < nz - 1; j += 2) {
        i0   = vi[j];
        i1   = vi[j + 1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
        sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
        sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
        sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
        sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
      }
      if (j == nz - 1) {
        tmp0 = tmps[vi[j]];
        sum1 -= v1[j] * tmp0;
        sum2 -= v2[j + 1] * tmp0;
        sum3 -= v3[j + 2] * tmp0;
        sum4 -= v4[j + 3] * tmp0;
        sum5 -= v5[j + 4] * tmp0;
      }

      tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
      row--;
      sum2 -= v2[0] * tmp0;
      sum3 -= v3[1] * tmp0;
      sum4 -= v4[2] * tmp0;
      sum5 -= v5[3] * tmp0;
      tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
      row--;
      sum3 -= v3[0] * tmp0;
      sum4 -= v4[1] * tmp0;
      sum5 -= v5[2] * tmp0;
      tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
      row--;
      sum4 -= v4[0] * tmp0;
      sum5 -= v5[1] * tmp0;
      tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
      row--;
      sum5 -= v5[0] * tmp0;
      x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
      row--;
      break;
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
    }
  }
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArrayWrite(xx, &x));
  PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscFunctionReturn(PETSC_SUCCESS); 2381019b515eSShri Abhyankar } 2382019b515eSShri Abhyankar 23834c1414c8SBarry Smith /* 23844c1414c8SBarry Smith Makes a longer coloring[] array and calls the usual code with that 23854c1414c8SBarry Smith */ 238666976f2fSJacob Faibussowitsch static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2387d71ae5a4SJacob Faibussowitsch { 23884c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 23894d12350bSJunchao Zhang PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row; 23904c1414c8SBarry Smith PetscInt *colorused, i; 23914c1414c8SBarry Smith ISColoringValue *newcolor; 23924c1414c8SBarry Smith 23934c1414c8SBarry Smith PetscFunctionBegin; 23944d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 23959566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &newcolor)); 23964c1414c8SBarry Smith /* loop over inodes, marking a color for each column*/ 23974c1414c8SBarry Smith row = 0; 23984c1414c8SBarry Smith for (i = 0; i < m; i++) { 23994d12350bSJunchao Zhang for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++)); 24004c1414c8SBarry Smith } 24014c1414c8SBarry Smith 24024c1414c8SBarry Smith /* eliminate unneeded colors */ 24039566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2404ad540459SPierre Jolivet for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 24054c1414c8SBarry Smith 2406ad540459SPierre Jolivet for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 24074c1414c8SBarry Smith ncolors = colorused[5 * ncolors - 1]; 24086497c311SBarry Smith for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i)); 24099566063dSJacob Faibussowitsch PetscCall(PetscFree(colorused)); 24109566063dSJacob Faibussowitsch 
PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 24119566063dSJacob Faibussowitsch PetscCall(PetscFree(coloring)); 24123ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 24134c1414c8SBarry Smith } 24144c1414c8SBarry Smith 2415af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 24162af78befSBarry Smith 241707425a8dSBarry Smith /* 241807425a8dSBarry Smith Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 241907425a8dSBarry Smith */ 242007425a8dSBarry Smith static PetscErrorCode MatInvertDiagonalForSOR_SeqAIJ_Inode(Mat A, PetscScalar omega, PetscScalar fshift) 2421d71ae5a4SJacob Faibussowitsch { 24222af78befSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 242307425a8dSBarry Smith MatScalar *ibdiag, *bdiag, work[25]; 242407425a8dSBarry Smith const MatScalar *v = a->a; 24257b6c816cSBarry Smith PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0; 242607425a8dSBarry Smith PetscInt m = a->inode.node_count, cnt = 0, i, j, row, nodesz; 242707425a8dSBarry Smith PetscInt k, ipvt[5]; 242807425a8dSBarry Smith PetscBool allowzeropivot = PetscNot(A->erroriffailure), zeropivotdetected; 2429*421480d9SBarry Smith const PetscInt *sizes = a->inode.size_csr, *diag; 24302af78befSBarry Smith 24312af78befSBarry Smith PetscFunctionBegin; 243207425a8dSBarry Smith if (a->idiagState == ((PetscObject)A)->state) PetscFunctionReturn(PETSC_SUCCESS); 2433*421480d9SBarry Smith PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &diag, NULL)); 24342af78befSBarry Smith if (!a->inode.ibdiag) { 24352af78befSBarry Smith /* calculate space needed for diagonal blocks */ 24364d12350bSJunchao Zhang for (i = 0; i < m; i++) { 24374d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 24384d12350bSJunchao Zhang cnt += nodesz * nodesz; 24394d12350bSJunchao Zhang } 2440f0d39aaaSBarry Smith a->inode.bdiagsize = cnt; 24419566063dSJacob Faibussowitsch 
PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 244271f1c65dSBarry Smith } 244371f1c65dSBarry Smith 244471f1c65dSBarry Smith /* copy over the diagonal blocks and invert them */ 24452af78befSBarry Smith ibdiag = a->inode.ibdiag; 24462af78befSBarry Smith bdiag = a->inode.bdiag; 24472af78befSBarry Smith cnt = 0; 24482af78befSBarry Smith for (i = 0, row = 0; i < m; i++) { 24494d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 24504d12350bSJunchao Zhang for (j = 0; j < nodesz; j++) { 24514d12350bSJunchao Zhang for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k]; 24522af78befSBarry Smith } 24534d12350bSJunchao Zhang PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz)); 24542af78befSBarry Smith 24554d12350bSJunchao Zhang switch (nodesz) { 24562af78befSBarry Smith case 1: 24572af78befSBarry Smith /* Create matrix data structure */ 24588e0e2a9aSHong Zhang if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2459966bd95aSPierre Jolivet PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 24607b6c816cSBarry Smith A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 24617b6c816cSBarry Smith A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 24627b6c816cSBarry Smith A->factorerror_zeropivot_row = row; 24639566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 24648e0e2a9aSHong Zhang } 246564c62002SMatthew Knepley ibdiag[cnt] = 1.0 / ibdiag[cnt]; 24662af78befSBarry Smith break; 24672af78befSBarry Smith case 2: 24689566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 24697b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 24702af78befSBarry Smith break; 24712af78befSBarry Smith case 3: 24729566063dSJacob Faibussowitsch 
PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 24737b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 24742af78befSBarry Smith break; 24752af78befSBarry Smith case 4: 24769566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 24777b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 24782af78befSBarry Smith break; 24792af78befSBarry Smith case 5: 24809566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 24817b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 24822af78befSBarry Smith break; 2483d71ae5a4SJacob Faibussowitsch default: 24844d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 24852af78befSBarry Smith } 24864d12350bSJunchao Zhang cnt += nodesz * nodesz; 24874d12350bSJunchao Zhang row += nodesz; 24882af78befSBarry Smith } 248907425a8dSBarry Smith a->inode.ibdiagState = ((PetscObject)A)->state; 249007425a8dSBarry Smith PetscFunctionReturn(PETSC_SUCCESS); 24912af78befSBarry Smith } 249207425a8dSBarry Smith 249307425a8dSBarry Smith PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 249407425a8dSBarry Smith { 249507425a8dSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 249607425a8dSBarry Smith PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 249707425a8dSBarry Smith MatScalar *ibdiag, *bdiag, *t; 249807425a8dSBarry Smith PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 249907425a8dSBarry Smith const MatScalar *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 250007425a8dSBarry 
Smith const PetscScalar *xb, *b; 250107425a8dSBarry Smith PetscInt n, m = a->inode.node_count, cnt = 0, i, row, i1, i2, nodesz; 250207425a8dSBarry Smith PetscInt sz; 250307425a8dSBarry Smith const PetscInt *sizes = a->inode.size_csr, *idx, *diag, *ii = a->i; 250407425a8dSBarry Smith 250507425a8dSBarry Smith PetscFunctionBegin; 250607425a8dSBarry Smith PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 250707425a8dSBarry Smith PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 250807425a8dSBarry Smith PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 250907425a8dSBarry Smith PetscCall(MatInvertDiagonalForSOR_SeqAIJ_Inode(A, omega, fshift)); 251007425a8dSBarry Smith diag = a->diag; 251107425a8dSBarry Smith 25122af78befSBarry Smith ibdiag = a->inode.ibdiag; 25132af78befSBarry Smith bdiag = a->inode.bdiag; 25145850ef23SBarry Smith t = a->inode.ssor_work; 25152af78befSBarry Smith 25169566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 25179566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 25185850ef23SBarry Smith /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 25195850ef23SBarry Smith if (flag & SOR_ZERO_INITIAL_GUESS) { 25202af78befSBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 25218862d2efSBarry Smith for (i = 0, row = 0; i < m; i++) { 25228862d2efSBarry Smith sz = diag[row] - ii[row]; 25238862d2efSBarry Smith v1 = a->a + ii[row]; 25248862d2efSBarry Smith idx = a->j + ii[row]; 25258862d2efSBarry Smith 25264108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 25274d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 25284d12350bSJunchao Zhang switch (nodesz) { 25298862d2efSBarry Smith case 1: 25308862d2efSBarry Smith 25318862d2efSBarry Smith sum1 = b[row]; 
25328862d2efSBarry Smith for (n = 0; n < sz - 1; n += 2) { 25338862d2efSBarry Smith i1 = idx[0]; 25348862d2efSBarry Smith i2 = idx[1]; 25358862d2efSBarry Smith idx += 2; 25368862d2efSBarry Smith tmp0 = x[i1]; 25378862d2efSBarry Smith tmp1 = x[i2]; 25389371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 25399371c9d4SSatish Balay v1 += 2; 25408862d2efSBarry Smith } 25418862d2efSBarry Smith 25428862d2efSBarry Smith if (n == sz - 1) { 2543f0d39aaaSBarry Smith tmp0 = x[*idx]; 2544f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 25458862d2efSBarry Smith } 25465850ef23SBarry Smith t[row] = sum1; 25478862d2efSBarry Smith x[row++] = sum1 * (*ibdiag++); 25488862d2efSBarry Smith break; 2549f0d39aaaSBarry Smith case 2: 2550f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 2551f0d39aaaSBarry Smith sum1 = b[row]; 2552f0d39aaaSBarry Smith sum2 = b[row + 1]; 2553f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 2554f0d39aaaSBarry Smith i1 = idx[0]; 2555f0d39aaaSBarry Smith i2 = idx[1]; 2556f0d39aaaSBarry Smith idx += 2; 2557f0d39aaaSBarry Smith tmp0 = x[i1]; 2558f0d39aaaSBarry Smith tmp1 = x[i2]; 25599371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 25609371c9d4SSatish Balay v1 += 2; 25619371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 25629371c9d4SSatish Balay v2 += 2; 2563f0d39aaaSBarry Smith } 2564f0d39aaaSBarry Smith 2565f0d39aaaSBarry Smith if (n == sz - 1) { 2566f0d39aaaSBarry Smith tmp0 = x[*idx]; 2567f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 2568f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 2569f0d39aaaSBarry Smith } 25705850ef23SBarry Smith t[row] = sum1; 25715850ef23SBarry Smith t[row + 1] = sum2; 2572f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2573f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2574f0d39aaaSBarry Smith ibdiag += 4; 2575f0d39aaaSBarry Smith break; 2576f0d39aaaSBarry Smith case 3: 2577f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 2578f0d39aaaSBarry Smith v3 = a->a + ii[row + 2]; 
2579f0d39aaaSBarry Smith sum1 = b[row]; 2580f0d39aaaSBarry Smith sum2 = b[row + 1]; 2581f0d39aaaSBarry Smith sum3 = b[row + 2]; 2582f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 2583f0d39aaaSBarry Smith i1 = idx[0]; 2584f0d39aaaSBarry Smith i2 = idx[1]; 2585f0d39aaaSBarry Smith idx += 2; 2586f0d39aaaSBarry Smith tmp0 = x[i1]; 2587f0d39aaaSBarry Smith tmp1 = x[i2]; 25889371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 25899371c9d4SSatish Balay v1 += 2; 25909371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 25919371c9d4SSatish Balay v2 += 2; 25929371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 25939371c9d4SSatish Balay v3 += 2; 2594f0d39aaaSBarry Smith } 2595f0d39aaaSBarry Smith 2596f0d39aaaSBarry Smith if (n == sz - 1) { 2597f0d39aaaSBarry Smith tmp0 = x[*idx]; 2598f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 2599f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 2600f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0; 2601f0d39aaaSBarry Smith } 26025850ef23SBarry Smith t[row] = sum1; 26035850ef23SBarry Smith t[row + 1] = sum2; 26045850ef23SBarry Smith t[row + 2] = sum3; 2605f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 2606f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 2607f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 2608f0d39aaaSBarry Smith ibdiag += 9; 2609f0d39aaaSBarry Smith break; 2610f0d39aaaSBarry Smith case 4: 2611f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 2612f0d39aaaSBarry Smith v3 = a->a + ii[row + 2]; 2613f0d39aaaSBarry Smith v4 = a->a + ii[row + 3]; 2614f0d39aaaSBarry Smith sum1 = b[row]; 2615f0d39aaaSBarry Smith sum2 = b[row + 1]; 2616f0d39aaaSBarry Smith sum3 = b[row + 2]; 2617f0d39aaaSBarry Smith sum4 = b[row + 3]; 2618f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 2619f0d39aaaSBarry Smith i1 = idx[0]; 2620f0d39aaaSBarry Smith i2 = idx[1]; 2621f0d39aaaSBarry Smith idx += 2; 2622f0d39aaaSBarry Smith 
tmp0 = x[i1]; 2623f0d39aaaSBarry Smith tmp1 = x[i2]; 26249371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 26259371c9d4SSatish Balay v1 += 2; 26269371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 26279371c9d4SSatish Balay v2 += 2; 26289371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 26299371c9d4SSatish Balay v3 += 2; 26309371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 26319371c9d4SSatish Balay v4 += 2; 2632f0d39aaaSBarry Smith } 2633f0d39aaaSBarry Smith 2634f0d39aaaSBarry Smith if (n == sz - 1) { 2635f0d39aaaSBarry Smith tmp0 = x[*idx]; 2636f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 2637f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 2638f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0; 2639f0d39aaaSBarry Smith sum4 -= v4[0] * tmp0; 2640f0d39aaaSBarry Smith } 26415850ef23SBarry Smith t[row] = sum1; 26425850ef23SBarry Smith t[row + 1] = sum2; 26435850ef23SBarry Smith t[row + 2] = sum3; 26445850ef23SBarry Smith t[row + 3] = sum4; 2645f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 2646f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 2647f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 2648f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 2649f0d39aaaSBarry Smith ibdiag += 16; 2650f0d39aaaSBarry Smith break; 2651f0d39aaaSBarry Smith case 5: 2652f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 2653f0d39aaaSBarry Smith v3 = a->a + ii[row + 2]; 2654f0d39aaaSBarry Smith v4 = a->a + ii[row + 3]; 2655f0d39aaaSBarry Smith v5 = a->a + ii[row + 4]; 2656f0d39aaaSBarry Smith sum1 = b[row]; 2657f0d39aaaSBarry Smith sum2 = b[row + 1]; 2658f0d39aaaSBarry Smith sum3 = b[row + 2]; 2659f0d39aaaSBarry Smith sum4 = b[row + 3]; 2660f0d39aaaSBarry Smith sum5 = b[row + 4]; 2661f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 
2662f0d39aaaSBarry Smith i1 = idx[0]; 2663f0d39aaaSBarry Smith i2 = idx[1]; 2664f0d39aaaSBarry Smith idx += 2; 2665f0d39aaaSBarry Smith tmp0 = x[i1]; 2666f0d39aaaSBarry Smith tmp1 = x[i2]; 26679371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 26689371c9d4SSatish Balay v1 += 2; 26699371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 26709371c9d4SSatish Balay v2 += 2; 26719371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 26729371c9d4SSatish Balay v3 += 2; 26739371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 26749371c9d4SSatish Balay v4 += 2; 26759371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 26769371c9d4SSatish Balay v5 += 2; 2677f0d39aaaSBarry Smith } 2678f0d39aaaSBarry Smith 2679f0d39aaaSBarry Smith if (n == sz - 1) { 2680f0d39aaaSBarry Smith tmp0 = x[*idx]; 2681f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 2682f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 2683f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0; 2684f0d39aaaSBarry Smith sum4 -= v4[0] * tmp0; 2685f0d39aaaSBarry Smith sum5 -= v5[0] * tmp0; 2686f0d39aaaSBarry Smith } 26875850ef23SBarry Smith t[row] = sum1; 26885850ef23SBarry Smith t[row + 1] = sum2; 26895850ef23SBarry Smith t[row + 2] = sum3; 26905850ef23SBarry Smith t[row + 3] = sum4; 26915850ef23SBarry Smith t[row + 4] = sum5; 2692f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 2693f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 2694f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 2695f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 2696f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 2697f0d39aaaSBarry Smith ibdiag += 25; 
2698f0d39aaaSBarry Smith break; 2699d71ae5a4SJacob Faibussowitsch default: 27004d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 27018862d2efSBarry Smith } 27022af78befSBarry Smith } 27032af78befSBarry Smith 27045850ef23SBarry Smith xb = t; 27059566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 27062af78befSBarry Smith } else xb = b; 27072af78befSBarry Smith if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 2708f0d39aaaSBarry Smith ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 2709d0f46423SBarry Smith for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 27104d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 27114d12350bSJunchao Zhang ibdiag -= nodesz * nodesz; 27128862d2efSBarry Smith sz = ii[row + 1] - diag[row] - 1; 27138862d2efSBarry Smith v1 = a->a + diag[row] + 1; 27148862d2efSBarry Smith idx = a->j + diag[row] + 1; 27152af78befSBarry Smith 27164108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 27174d12350bSJunchao Zhang switch (nodesz) { 27188862d2efSBarry Smith case 1: 27198862d2efSBarry Smith 27208862d2efSBarry Smith sum1 = xb[row]; 27218862d2efSBarry Smith for (n = 0; n < sz - 1; n += 2) { 27228862d2efSBarry Smith i1 = idx[0]; 27238862d2efSBarry Smith i2 = idx[1]; 27248862d2efSBarry Smith idx += 2; 27258862d2efSBarry Smith tmp0 = x[i1]; 27268862d2efSBarry Smith tmp1 = x[i2]; 27279371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 27289371c9d4SSatish Balay v1 += 2; 27298862d2efSBarry Smith } 27308862d2efSBarry Smith 27318862d2efSBarry Smith if (n == sz - 1) { 2732f0d39aaaSBarry Smith tmp0 = x[*idx]; 2733f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 27348862d2efSBarry Smith } 2735f0d39aaaSBarry Smith x[row--] = sum1 * (*ibdiag); 2736f0d39aaaSBarry Smith break; 2737f0d39aaaSBarry Smith 2738f0d39aaaSBarry Smith case 2: 2739f0d39aaaSBarry Smith 2740f0d39aaaSBarry Smith sum1 = xb[row]; 
2741f0d39aaaSBarry Smith sum2 = xb[row - 1]; 2742f0d39aaaSBarry Smith /* note that sum1 is associated with the second of the two rows */ 2743f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 2744f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 2745f0d39aaaSBarry Smith i1 = idx[0]; 2746f0d39aaaSBarry Smith i2 = idx[1]; 2747f0d39aaaSBarry Smith idx += 2; 2748f0d39aaaSBarry Smith tmp0 = x[i1]; 2749f0d39aaaSBarry Smith tmp1 = x[i2]; 27509371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 27519371c9d4SSatish Balay v1 += 2; 27529371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 27539371c9d4SSatish Balay v2 += 2; 2754f0d39aaaSBarry Smith } 2755f0d39aaaSBarry Smith 2756f0d39aaaSBarry Smith if (n == sz - 1) { 2757f0d39aaaSBarry Smith tmp0 = x[*idx]; 2758f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 2759f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 2760f0d39aaaSBarry Smith } 2761f0d39aaaSBarry Smith x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 2762f0d39aaaSBarry Smith x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 2763f0d39aaaSBarry Smith break; 2764f0d39aaaSBarry Smith case 3: 2765f0d39aaaSBarry Smith 2766f0d39aaaSBarry Smith sum1 = xb[row]; 2767f0d39aaaSBarry Smith sum2 = xb[row - 1]; 2768f0d39aaaSBarry Smith sum3 = xb[row - 2]; 2769f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 2770f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3; 2771f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 2772f0d39aaaSBarry Smith i1 = idx[0]; 2773f0d39aaaSBarry Smith i2 = idx[1]; 2774f0d39aaaSBarry Smith idx += 2; 2775f0d39aaaSBarry Smith tmp0 = x[i1]; 2776f0d39aaaSBarry Smith tmp1 = x[i2]; 27779371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 27789371c9d4SSatish Balay v1 += 2; 27799371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 27809371c9d4SSatish Balay v2 += 2; 27819371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 27829371c9d4SSatish Balay v3 += 2; 2783f0d39aaaSBarry Smith } 2784f0d39aaaSBarry Smith 2785f0d39aaaSBarry Smith if (n == 
sz - 1) { 2786f0d39aaaSBarry Smith tmp0 = x[*idx]; 2787f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 2788f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 2789f0d39aaaSBarry Smith sum3 -= *v3 * tmp0; 2790f0d39aaaSBarry Smith } 2791f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 2792f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 2793f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 2794f0d39aaaSBarry Smith break; 2795f0d39aaaSBarry Smith case 4: 2796f0d39aaaSBarry Smith 2797f0d39aaaSBarry Smith sum1 = xb[row]; 2798f0d39aaaSBarry Smith sum2 = xb[row - 1]; 2799f0d39aaaSBarry Smith sum3 = xb[row - 2]; 2800f0d39aaaSBarry Smith sum4 = xb[row - 3]; 2801f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 2802f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3; 2803f0d39aaaSBarry Smith v4 = a->a + diag[row - 3] + 4; 2804f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 2805f0d39aaaSBarry Smith i1 = idx[0]; 2806f0d39aaaSBarry Smith i2 = idx[1]; 2807f0d39aaaSBarry Smith idx += 2; 2808f0d39aaaSBarry Smith tmp0 = x[i1]; 2809f0d39aaaSBarry Smith tmp1 = x[i2]; 28109371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 28119371c9d4SSatish Balay v1 += 2; 28129371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 28139371c9d4SSatish Balay v2 += 2; 28149371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 28159371c9d4SSatish Balay v3 += 2; 28169371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 28179371c9d4SSatish Balay v4 += 2; 2818f0d39aaaSBarry Smith } 2819f0d39aaaSBarry Smith 2820f0d39aaaSBarry Smith if (n == sz - 1) { 2821f0d39aaaSBarry Smith tmp0 = x[*idx]; 2822f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 2823f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 2824f0d39aaaSBarry Smith sum3 -= *v3 * tmp0; 2825f0d39aaaSBarry Smith sum4 -= *v4 * tmp0; 2826f0d39aaaSBarry Smith } 2827f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] 
+ sum1 * ibdiag[15]; 2828f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 2829f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 2830f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 2831f0d39aaaSBarry Smith break; 2832f0d39aaaSBarry Smith case 5: 2833f0d39aaaSBarry Smith 2834f0d39aaaSBarry Smith sum1 = xb[row]; 2835f0d39aaaSBarry Smith sum2 = xb[row - 1]; 2836f0d39aaaSBarry Smith sum3 = xb[row - 2]; 2837f0d39aaaSBarry Smith sum4 = xb[row - 3]; 2838f0d39aaaSBarry Smith sum5 = xb[row - 4]; 2839f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 2840f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3; 2841f0d39aaaSBarry Smith v4 = a->a + diag[row - 3] + 4; 2842f0d39aaaSBarry Smith v5 = a->a + diag[row - 4] + 5; 2843f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 2844f0d39aaaSBarry Smith i1 = idx[0]; 2845f0d39aaaSBarry Smith i2 = idx[1]; 2846f0d39aaaSBarry Smith idx += 2; 2847f0d39aaaSBarry Smith tmp0 = x[i1]; 2848f0d39aaaSBarry Smith tmp1 = x[i2]; 28499371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 28509371c9d4SSatish Balay v1 += 2; 28519371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 28529371c9d4SSatish Balay v2 += 2; 28539371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 28549371c9d4SSatish Balay v3 += 2; 28559371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 28569371c9d4SSatish Balay v4 += 2; 28579371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 28589371c9d4SSatish Balay v5 += 2; 2859f0d39aaaSBarry Smith } 2860f0d39aaaSBarry Smith 2861f0d39aaaSBarry Smith if (n == sz - 1) { 2862f0d39aaaSBarry Smith tmp0 = x[*idx]; 2863f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 2864f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 2865f0d39aaaSBarry Smith sum3 -= *v3 * tmp0; 2866f0d39aaaSBarry Smith sum4 -= *v4 * tmp0; 2867f0d39aaaSBarry Smith sum5 -= *v5 * tmp0; 
2868f0d39aaaSBarry Smith } 2869f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 2870f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 2871f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 2872f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 2873f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 28748862d2efSBarry Smith break; 2875d71ae5a4SJacob Faibussowitsch default: 28764d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 28778862d2efSBarry Smith } 28782af78befSBarry Smith } 28792af78befSBarry Smith 28809566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 28812af78befSBarry Smith } 28822af78befSBarry Smith its--; 28835850ef23SBarry Smith } 28845850ef23SBarry Smith while (its--) { 28855850ef23SBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 28864d12350bSJunchao Zhang for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) { 28874d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 2888d876e2b0SMark Adams sz = diag[row] - ii[row]; 28895850ef23SBarry Smith v1 = a->a + ii[row]; 28905850ef23SBarry Smith idx = a->j + ii[row]; 28915850ef23SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 28924d12350bSJunchao Zhang switch (nodesz) { 28935850ef23SBarry Smith case 1: 28945850ef23SBarry Smith sum1 = b[row]; 28955850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 28965850ef23SBarry Smith i1 = idx[0]; 28975850ef23SBarry Smith i2 = idx[1]; 28985850ef23SBarry Smith idx += 
2; 28995850ef23SBarry Smith tmp0 = x[i1]; 29005850ef23SBarry Smith tmp1 = x[i2]; 29019371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 29029371c9d4SSatish Balay v1 += 2; 29035850ef23SBarry Smith } 29045850ef23SBarry Smith if (n == sz - 1) { 2905d876e2b0SMark Adams tmp0 = x[*idx++]; 2906d876e2b0SMark Adams sum1 -= *v1 * tmp0; 2907d876e2b0SMark Adams v1++; 2908d876e2b0SMark Adams } 2909d876e2b0SMark Adams t[row] = sum1; 2910d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 1; 2911d876e2b0SMark Adams idx = a->j + diag[row] + 1; 2912d876e2b0SMark Adams v1 += 1; 2913d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 2914d876e2b0SMark Adams i1 = idx[0]; 2915d876e2b0SMark Adams i2 = idx[1]; 2916d876e2b0SMark Adams idx += 2; 2917d876e2b0SMark Adams tmp0 = x[i1]; 2918d876e2b0SMark Adams tmp1 = x[i2]; 29199371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 29209371c9d4SSatish Balay v1 += 2; 2921d876e2b0SMark Adams } 2922d876e2b0SMark Adams if (n == sz - 1) { 2923d876e2b0SMark Adams tmp0 = x[*idx++]; 29245850ef23SBarry Smith sum1 -= *v1 * tmp0; 29255850ef23SBarry Smith } 29265850ef23SBarry Smith /* in MatSOR_SeqAIJ this line would be 29275850ef23SBarry Smith * 29285850ef23SBarry Smith * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 29295850ef23SBarry Smith * 29305850ef23SBarry Smith * but omega == 1, so this becomes 29315850ef23SBarry Smith * 2932d876e2b0SMark Adams * x[row] = sum1*(*ibdiag++); 29335850ef23SBarry Smith * 29345850ef23SBarry Smith */ 2935d876e2b0SMark Adams x[row] = sum1 * (*ibdiag); 29365850ef23SBarry Smith break; 29375850ef23SBarry Smith case 2: 29385850ef23SBarry Smith v2 = a->a + ii[row + 1]; 29395850ef23SBarry Smith sum1 = b[row]; 29405850ef23SBarry Smith sum2 = b[row + 1]; 29415850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 29425850ef23SBarry Smith i1 = idx[0]; 29435850ef23SBarry Smith i2 = idx[1]; 29445850ef23SBarry Smith idx += 2; 29455850ef23SBarry Smith tmp0 = x[i1]; 29465850ef23SBarry Smith tmp1 = x[i2]; 
29479371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 29489371c9d4SSatish Balay v1 += 2; 29499371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 29509371c9d4SSatish Balay v2 += 2; 29515850ef23SBarry Smith } 2952d876e2b0SMark Adams if (n == sz - 1) { 2953d876e2b0SMark Adams tmp0 = x[*idx++]; 2954d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 2955d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 29569371c9d4SSatish Balay v1++; 29579371c9d4SSatish Balay v2++; 2958d876e2b0SMark Adams } 2959d876e2b0SMark Adams t[row] = sum1; 2960d876e2b0SMark Adams t[row + 1] = sum2; 2961d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 2; 2962d876e2b0SMark Adams idx = a->j + diag[row] + 2; 2963d876e2b0SMark Adams v1 += 2; 2964d876e2b0SMark Adams v2 += 2; 2965d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 2966d876e2b0SMark Adams i1 = idx[0]; 2967d876e2b0SMark Adams i2 = idx[1]; 2968d876e2b0SMark Adams idx += 2; 2969d876e2b0SMark Adams tmp0 = x[i1]; 2970d876e2b0SMark Adams tmp1 = x[i2]; 29719371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 29729371c9d4SSatish Balay v1 += 2; 29739371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 29749371c9d4SSatish Balay v2 += 2; 2975d876e2b0SMark Adams } 29765850ef23SBarry Smith if (n == sz - 1) { 29775850ef23SBarry Smith tmp0 = x[*idx]; 29785850ef23SBarry Smith sum1 -= v1[0] * tmp0; 29795850ef23SBarry Smith sum2 -= v2[0] * tmp0; 29805850ef23SBarry Smith } 2981d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2982d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 29835850ef23SBarry Smith break; 29845850ef23SBarry Smith case 3: 29855850ef23SBarry Smith v2 = a->a + ii[row + 1]; 29865850ef23SBarry Smith v3 = a->a + ii[row + 2]; 29875850ef23SBarry Smith sum1 = b[row]; 29885850ef23SBarry Smith sum2 = b[row + 1]; 29895850ef23SBarry Smith sum3 = b[row + 2]; 29905850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 29915850ef23SBarry Smith i1 = idx[0]; 29925850ef23SBarry Smith i2 = idx[1]; 
29935850ef23SBarry Smith idx += 2; 29945850ef23SBarry Smith tmp0 = x[i1]; 29955850ef23SBarry Smith tmp1 = x[i2]; 29969371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 29979371c9d4SSatish Balay v1 += 2; 29989371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 29999371c9d4SSatish Balay v2 += 2; 30009371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 30019371c9d4SSatish Balay v3 += 2; 30025850ef23SBarry Smith } 3003d876e2b0SMark Adams if (n == sz - 1) { 3004d876e2b0SMark Adams tmp0 = x[*idx++]; 3005d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 3006d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 3007d876e2b0SMark Adams sum3 -= v3[0] * tmp0; 30089371c9d4SSatish Balay v1++; 30099371c9d4SSatish Balay v2++; 30109371c9d4SSatish Balay v3++; 3011d876e2b0SMark Adams } 3012d876e2b0SMark Adams t[row] = sum1; 3013d876e2b0SMark Adams t[row + 1] = sum2; 3014d876e2b0SMark Adams t[row + 2] = sum3; 3015d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 3; 3016d876e2b0SMark Adams idx = a->j + diag[row] + 3; 3017d876e2b0SMark Adams v1 += 3; 3018d876e2b0SMark Adams v2 += 3; 3019d876e2b0SMark Adams v3 += 3; 3020d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3021d876e2b0SMark Adams i1 = idx[0]; 3022d876e2b0SMark Adams i2 = idx[1]; 3023d876e2b0SMark Adams idx += 2; 3024d876e2b0SMark Adams tmp0 = x[i1]; 3025d876e2b0SMark Adams tmp1 = x[i2]; 30269371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 30279371c9d4SSatish Balay v1 += 2; 30289371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 30299371c9d4SSatish Balay v2 += 2; 30309371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 30319371c9d4SSatish Balay v3 += 2; 3032d876e2b0SMark Adams } 30335850ef23SBarry Smith if (n == sz - 1) { 30345850ef23SBarry Smith tmp0 = x[*idx]; 30355850ef23SBarry Smith sum1 -= v1[0] * tmp0; 30365850ef23SBarry Smith sum2 -= v2[0] * tmp0; 30375850ef23SBarry Smith sum3 -= v3[0] * tmp0; 30385850ef23SBarry Smith } 3039d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + 
sum3 * ibdiag[6]; 3040d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3041d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 30425850ef23SBarry Smith break; 30435850ef23SBarry Smith case 4: 30445850ef23SBarry Smith v2 = a->a + ii[row + 1]; 30455850ef23SBarry Smith v3 = a->a + ii[row + 2]; 30465850ef23SBarry Smith v4 = a->a + ii[row + 3]; 30475850ef23SBarry Smith sum1 = b[row]; 30485850ef23SBarry Smith sum2 = b[row + 1]; 30495850ef23SBarry Smith sum3 = b[row + 2]; 30505850ef23SBarry Smith sum4 = b[row + 3]; 30515850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 30525850ef23SBarry Smith i1 = idx[0]; 30535850ef23SBarry Smith i2 = idx[1]; 30545850ef23SBarry Smith idx += 2; 30555850ef23SBarry Smith tmp0 = x[i1]; 30565850ef23SBarry Smith tmp1 = x[i2]; 30579371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 30589371c9d4SSatish Balay v1 += 2; 30599371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 30609371c9d4SSatish Balay v2 += 2; 30619371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 30629371c9d4SSatish Balay v3 += 2; 30639371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 30649371c9d4SSatish Balay v4 += 2; 30655850ef23SBarry Smith } 3066d876e2b0SMark Adams if (n == sz - 1) { 3067d876e2b0SMark Adams tmp0 = x[*idx++]; 3068d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 3069d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 3070d876e2b0SMark Adams sum3 -= v3[0] * tmp0; 3071d876e2b0SMark Adams sum4 -= v4[0] * tmp0; 30729371c9d4SSatish Balay v1++; 30739371c9d4SSatish Balay v2++; 30749371c9d4SSatish Balay v3++; 30759371c9d4SSatish Balay v4++; 3076d876e2b0SMark Adams } 3077d876e2b0SMark Adams t[row] = sum1; 3078d876e2b0SMark Adams t[row + 1] = sum2; 3079d876e2b0SMark Adams t[row + 2] = sum3; 3080d876e2b0SMark Adams t[row + 3] = sum4; 3081d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 4; 3082d876e2b0SMark Adams idx = a->j + diag[row] + 4; 3083d876e2b0SMark Adams v1 += 4; 
3084d876e2b0SMark Adams v2 += 4; 3085d876e2b0SMark Adams v3 += 4; 3086d876e2b0SMark Adams v4 += 4; 3087d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3088d876e2b0SMark Adams i1 = idx[0]; 3089d876e2b0SMark Adams i2 = idx[1]; 3090d876e2b0SMark Adams idx += 2; 3091d876e2b0SMark Adams tmp0 = x[i1]; 3092d876e2b0SMark Adams tmp1 = x[i2]; 30939371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 30949371c9d4SSatish Balay v1 += 2; 30959371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 30969371c9d4SSatish Balay v2 += 2; 30979371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 30989371c9d4SSatish Balay v3 += 2; 30999371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 31009371c9d4SSatish Balay v4 += 2; 3101d876e2b0SMark Adams } 31025850ef23SBarry Smith if (n == sz - 1) { 31035850ef23SBarry Smith tmp0 = x[*idx]; 31045850ef23SBarry Smith sum1 -= v1[0] * tmp0; 31055850ef23SBarry Smith sum2 -= v2[0] * tmp0; 31065850ef23SBarry Smith sum3 -= v3[0] * tmp0; 31075850ef23SBarry Smith sum4 -= v4[0] * tmp0; 31085850ef23SBarry Smith } 3109d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3110d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3111d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3112d876e2b0SMark Adams x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 31135850ef23SBarry Smith break; 31145850ef23SBarry Smith case 5: 31155850ef23SBarry Smith v2 = a->a + ii[row + 1]; 31165850ef23SBarry Smith v3 = a->a + ii[row + 2]; 31175850ef23SBarry Smith v4 = a->a + ii[row + 3]; 31185850ef23SBarry Smith v5 = a->a + ii[row + 4]; 31195850ef23SBarry Smith sum1 = b[row]; 31205850ef23SBarry Smith sum2 = b[row + 1]; 31215850ef23SBarry Smith sum3 = b[row + 2]; 31225850ef23SBarry Smith sum4 = b[row + 3]; 31235850ef23SBarry Smith sum5 = b[row + 4]; 
31245850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 31255850ef23SBarry Smith i1 = idx[0]; 31265850ef23SBarry Smith i2 = idx[1]; 31275850ef23SBarry Smith idx += 2; 31285850ef23SBarry Smith tmp0 = x[i1]; 31295850ef23SBarry Smith tmp1 = x[i2]; 31309371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 31319371c9d4SSatish Balay v1 += 2; 31329371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 31339371c9d4SSatish Balay v2 += 2; 31349371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 31359371c9d4SSatish Balay v3 += 2; 31369371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 31379371c9d4SSatish Balay v4 += 2; 31389371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 31399371c9d4SSatish Balay v5 += 2; 31405850ef23SBarry Smith } 31415850ef23SBarry Smith if (n == sz - 1) { 3142d876e2b0SMark Adams tmp0 = x[*idx++]; 31435850ef23SBarry Smith sum1 -= v1[0] * tmp0; 31445850ef23SBarry Smith sum2 -= v2[0] * tmp0; 31455850ef23SBarry Smith sum3 -= v3[0] * tmp0; 31465850ef23SBarry Smith sum4 -= v4[0] * tmp0; 31475850ef23SBarry Smith sum5 -= v5[0] * tmp0; 31489371c9d4SSatish Balay v1++; 31499371c9d4SSatish Balay v2++; 31509371c9d4SSatish Balay v3++; 31519371c9d4SSatish Balay v4++; 31529371c9d4SSatish Balay v5++; 31535850ef23SBarry Smith } 3154d876e2b0SMark Adams t[row] = sum1; 3155d876e2b0SMark Adams t[row + 1] = sum2; 3156d876e2b0SMark Adams t[row + 2] = sum3; 3157d876e2b0SMark Adams t[row + 3] = sum4; 3158d876e2b0SMark Adams t[row + 4] = sum5; 3159d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 5; 3160d876e2b0SMark Adams idx = a->j + diag[row] + 5; 3161d876e2b0SMark Adams v1 += 5; 3162d876e2b0SMark Adams v2 += 5; 3163d876e2b0SMark Adams v3 += 5; 3164d876e2b0SMark Adams v4 += 5; 3165d876e2b0SMark Adams v5 += 5; 31665850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 31675850ef23SBarry Smith i1 = idx[0]; 31685850ef23SBarry Smith i2 = idx[1]; 31695850ef23SBarry Smith idx += 2; 31705850ef23SBarry Smith tmp0 = x[i1]; 31715850ef23SBarry Smith tmp1 = 
x[i2]; 31729371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 31739371c9d4SSatish Balay v1 += 2; 31749371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 31759371c9d4SSatish Balay v2 += 2; 31769371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 31779371c9d4SSatish Balay v3 += 2; 31789371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 31799371c9d4SSatish Balay v4 += 2; 31809371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 31819371c9d4SSatish Balay v5 += 2; 31825850ef23SBarry Smith } 31835850ef23SBarry Smith if (n == sz - 1) { 31845850ef23SBarry Smith tmp0 = x[*idx]; 3185d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 3186d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 3187d876e2b0SMark Adams sum3 -= v3[0] * tmp0; 3188d876e2b0SMark Adams sum4 -= v4[0] * tmp0; 3189d876e2b0SMark Adams sum5 -= v5[0] * tmp0; 31905850ef23SBarry Smith } 3191d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3192d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3193d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3194d876e2b0SMark Adams x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3195d876e2b0SMark Adams x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3196d876e2b0SMark Adams break; 3197d71ae5a4SJacob Faibussowitsch default: 31984d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3199d876e2b0SMark Adams } 3200d876e2b0SMark Adams } 3201d876e2b0SMark Adams xb = t; 32029566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3203d876e2b0SMark Adams } else xb = b; 
3204d876e2b0SMark Adams 3205d876e2b0SMark Adams if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3206d876e2b0SMark Adams ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3207d876e2b0SMark Adams for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 32084d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 32094d12350bSJunchao Zhang ibdiag -= nodesz * nodesz; 3210d876e2b0SMark Adams 3211d876e2b0SMark Adams /* set RHS */ 3212d876e2b0SMark Adams if (xb == b) { 3213d876e2b0SMark Adams /* whole (old way) */ 3214d876e2b0SMark Adams sz = ii[row + 1] - ii[row]; 3215d876e2b0SMark Adams idx = a->j + ii[row]; 32164d12350bSJunchao Zhang switch (nodesz) { 3217d71ae5a4SJacob Faibussowitsch case 5: 3218d71ae5a4SJacob Faibussowitsch v5 = a->a + ii[row - 4]; /* fall through */ 3219d71ae5a4SJacob Faibussowitsch case 4: 3220d71ae5a4SJacob Faibussowitsch v4 = a->a + ii[row - 3]; /* fall through */ 3221d71ae5a4SJacob Faibussowitsch case 3: 3222d71ae5a4SJacob Faibussowitsch v3 = a->a + ii[row - 2]; /* fall through */ 3223d71ae5a4SJacob Faibussowitsch case 2: 3224d71ae5a4SJacob Faibussowitsch v2 = a->a + ii[row - 1]; /* fall through */ 3225d71ae5a4SJacob Faibussowitsch case 1: 3226d71ae5a4SJacob Faibussowitsch v1 = a->a + ii[row]; 3227d71ae5a4SJacob Faibussowitsch break; 3228d71ae5a4SJacob Faibussowitsch default: 32294d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3230d876e2b0SMark Adams } 3231d876e2b0SMark Adams } else { 3232d876e2b0SMark Adams /* upper, no diag */ 3233d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 1; 3234d876e2b0SMark Adams idx = a->j + diag[row] + 1; 32354d12350bSJunchao Zhang switch (nodesz) { 3236d71ae5a4SJacob Faibussowitsch case 5: 3237d71ae5a4SJacob Faibussowitsch v5 = a->a + diag[row - 4] + 5; /* fall through */ 3238d71ae5a4SJacob Faibussowitsch case 4: 3239d71ae5a4SJacob Faibussowitsch v4 = a->a + diag[row - 3] + 4; /* fall through */ 
3240d71ae5a4SJacob Faibussowitsch case 3: 3241d71ae5a4SJacob Faibussowitsch v3 = a->a + diag[row - 2] + 3; /* fall through */ 3242d71ae5a4SJacob Faibussowitsch case 2: 3243d71ae5a4SJacob Faibussowitsch v2 = a->a + diag[row - 1] + 2; /* fall through */ 3244d71ae5a4SJacob Faibussowitsch case 1: 3245d71ae5a4SJacob Faibussowitsch v1 = a->a + diag[row] + 1; 3246d876e2b0SMark Adams } 3247d876e2b0SMark Adams } 3248d876e2b0SMark Adams /* set sum */ 32494d12350bSJunchao Zhang switch (nodesz) { 3250d71ae5a4SJacob Faibussowitsch case 5: 3251d71ae5a4SJacob Faibussowitsch sum5 = xb[row - 4]; /* fall through */ 3252d71ae5a4SJacob Faibussowitsch case 4: 3253d71ae5a4SJacob Faibussowitsch sum4 = xb[row - 3]; /* fall through */ 3254d71ae5a4SJacob Faibussowitsch case 3: 3255d71ae5a4SJacob Faibussowitsch sum3 = xb[row - 2]; /* fall through */ 3256d71ae5a4SJacob Faibussowitsch case 2: 3257d71ae5a4SJacob Faibussowitsch sum2 = xb[row - 1]; /* fall through */ 3258d876e2b0SMark Adams case 1: 3259d876e2b0SMark Adams /* note that sum1 is associated with the last row */ 3260d876e2b0SMark Adams sum1 = xb[row]; 3261d876e2b0SMark Adams } 3262d876e2b0SMark Adams /* do sums */ 3263d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3264d876e2b0SMark Adams i1 = idx[0]; 3265d876e2b0SMark Adams i2 = idx[1]; 3266d876e2b0SMark Adams idx += 2; 3267d876e2b0SMark Adams tmp0 = x[i1]; 3268d876e2b0SMark Adams tmp1 = x[i2]; 32694d12350bSJunchao Zhang switch (nodesz) { 3270d71ae5a4SJacob Faibussowitsch case 5: 3271d71ae5a4SJacob Faibussowitsch sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3272d71ae5a4SJacob Faibussowitsch v5 += 2; /* fall through */ 3273d71ae5a4SJacob Faibussowitsch case 4: 3274d71ae5a4SJacob Faibussowitsch sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3275d71ae5a4SJacob Faibussowitsch v4 += 2; /* fall through */ 3276d71ae5a4SJacob Faibussowitsch case 3: 3277d71ae5a4SJacob Faibussowitsch sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3278d71ae5a4SJacob Faibussowitsch v3 += 2; /* fall through */ 3279d71ae5a4SJacob 
Faibussowitsch case 2: 3280d71ae5a4SJacob Faibussowitsch sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3281d71ae5a4SJacob Faibussowitsch v2 += 2; /* fall through */ 3282d71ae5a4SJacob Faibussowitsch case 1: 3283d71ae5a4SJacob Faibussowitsch sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3284d71ae5a4SJacob Faibussowitsch v1 += 2; 3285d876e2b0SMark Adams } 3286d876e2b0SMark Adams } 3287d876e2b0SMark Adams /* ragged edge */ 3288d876e2b0SMark Adams if (n == sz - 1) { 3289d876e2b0SMark Adams tmp0 = x[*idx]; 32904d12350bSJunchao Zhang switch (nodesz) { 3291d71ae5a4SJacob Faibussowitsch case 5: 3292d71ae5a4SJacob Faibussowitsch sum5 -= *v5 * tmp0; /* fall through */ 3293d71ae5a4SJacob Faibussowitsch case 4: 3294d71ae5a4SJacob Faibussowitsch sum4 -= *v4 * tmp0; /* fall through */ 3295d71ae5a4SJacob Faibussowitsch case 3: 3296d71ae5a4SJacob Faibussowitsch sum3 -= *v3 * tmp0; /* fall through */ 3297d71ae5a4SJacob Faibussowitsch case 2: 3298d71ae5a4SJacob Faibussowitsch sum2 -= *v2 * tmp0; /* fall through */ 3299d71ae5a4SJacob Faibussowitsch case 1: 3300d71ae5a4SJacob Faibussowitsch sum1 -= *v1 * tmp0; 3301d876e2b0SMark Adams } 3302d876e2b0SMark Adams } 3303d876e2b0SMark Adams /* update */ 3304d876e2b0SMark Adams if (xb == b) { 3305d876e2b0SMark Adams /* whole (old way) w/ diag */ 33064d12350bSJunchao Zhang switch (nodesz) { 3307d876e2b0SMark Adams case 5: 33085850ef23SBarry Smith x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 33095850ef23SBarry Smith x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 33105850ef23SBarry Smith x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 33115850ef23SBarry Smith x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 33125850ef23SBarry Smith x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + 
sum1 * ibdiag[20]; 33135850ef23SBarry Smith break; 3314d876e2b0SMark Adams case 4: 3315d876e2b0SMark Adams x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3316d876e2b0SMark Adams x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3317d876e2b0SMark Adams x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3318d876e2b0SMark Adams x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3319d876e2b0SMark Adams break; 3320d876e2b0SMark Adams case 3: 3321d876e2b0SMark Adams x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3322d876e2b0SMark Adams x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3323d876e2b0SMark Adams x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3324d876e2b0SMark Adams break; 3325d876e2b0SMark Adams case 2: 3326d876e2b0SMark Adams x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3327d876e2b0SMark Adams x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3328d876e2b0SMark Adams break; 3329d71ae5a4SJacob Faibussowitsch case 1: 3330d71ae5a4SJacob Faibussowitsch x[row--] += sum1 * (*ibdiag); 3331d71ae5a4SJacob Faibussowitsch break; 3332d876e2b0SMark Adams } 3333d876e2b0SMark Adams } else { 3334d876e2b0SMark Adams /* no diag so set = */ 33354d12350bSJunchao Zhang switch (nodesz) { 3336d876e2b0SMark Adams case 5: 3337d876e2b0SMark Adams x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3338d876e2b0SMark Adams x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3339d876e2b0SMark Adams x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3340d876e2b0SMark Adams x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3341d876e2b0SMark Adams 
x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3342d876e2b0SMark Adams break; 3343d876e2b0SMark Adams case 4: 3344d876e2b0SMark Adams x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3345d876e2b0SMark Adams x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3346d876e2b0SMark Adams x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3347d876e2b0SMark Adams x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3348d876e2b0SMark Adams break; 3349d876e2b0SMark Adams case 3: 3350d876e2b0SMark Adams x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3351d876e2b0SMark Adams x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3352d876e2b0SMark Adams x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3353d876e2b0SMark Adams break; 3354d876e2b0SMark Adams case 2: 3355d876e2b0SMark Adams x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3356d876e2b0SMark Adams x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3357d876e2b0SMark Adams break; 3358d71ae5a4SJacob Faibussowitsch case 1: 3359d71ae5a4SJacob Faibussowitsch x[row--] = sum1 * (*ibdiag); 3360d71ae5a4SJacob Faibussowitsch break; 33615850ef23SBarry Smith } 33625850ef23SBarry Smith } 3363d876e2b0SMark Adams } 3364d876e2b0SMark Adams if (xb == b) { 33659566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 3366d876e2b0SMark Adams } else { 33679566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3368d876e2b0SMark Adams } 33695850ef23SBarry Smith } 33702af78befSBarry Smith } 337189c6957cSBarry Smith if (flag & SOR_EISENSTAT) { 337289c6957cSBarry Smith /* 337389c6957cSBarry Smith Apply (U + D)^-1 where D is now the block diagonal 337489c6957cSBarry Smith */ 337589c6957cSBarry Smith ibdiag = a->inode.ibdiag + 
a->inode.bdiagsize; 337689c6957cSBarry Smith for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 33774d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 33784d12350bSJunchao Zhang ibdiag -= nodesz * nodesz; 337989c6957cSBarry Smith sz = ii[row + 1] - diag[row] - 1; 338089c6957cSBarry Smith v1 = a->a + diag[row] + 1; 338189c6957cSBarry Smith idx = a->j + diag[row] + 1; 33824108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 33834d12350bSJunchao Zhang switch (nodesz) { 338489c6957cSBarry Smith case 1: 338589c6957cSBarry Smith 338689c6957cSBarry Smith sum1 = b[row]; 338789c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 338889c6957cSBarry Smith i1 = idx[0]; 338989c6957cSBarry Smith i2 = idx[1]; 339089c6957cSBarry Smith idx += 2; 339189c6957cSBarry Smith tmp0 = x[i1]; 339289c6957cSBarry Smith tmp1 = x[i2]; 33939371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 33949371c9d4SSatish Balay v1 += 2; 339589c6957cSBarry Smith } 339689c6957cSBarry Smith 339789c6957cSBarry Smith if (n == sz - 1) { 339889c6957cSBarry Smith tmp0 = x[*idx]; 339989c6957cSBarry Smith sum1 -= *v1 * tmp0; 340089c6957cSBarry Smith } 34019371c9d4SSatish Balay x[row] = sum1 * (*ibdiag); 34029371c9d4SSatish Balay row--; 340389c6957cSBarry Smith break; 340489c6957cSBarry Smith 340589c6957cSBarry Smith case 2: 340689c6957cSBarry Smith 340789c6957cSBarry Smith sum1 = b[row]; 340889c6957cSBarry Smith sum2 = b[row - 1]; 340989c6957cSBarry Smith /* note that sum1 is associated with the second of the two rows */ 341089c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 341189c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 341289c6957cSBarry Smith i1 = idx[0]; 341389c6957cSBarry Smith i2 = idx[1]; 341489c6957cSBarry Smith idx += 2; 341589c6957cSBarry Smith tmp0 = x[i1]; 341689c6957cSBarry Smith tmp1 = x[i2]; 34179371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 34189371c9d4SSatish Balay v1 += 2; 34199371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + 
v2[1] * tmp1; 34209371c9d4SSatish Balay v2 += 2; 342189c6957cSBarry Smith } 342289c6957cSBarry Smith 342389c6957cSBarry Smith if (n == sz - 1) { 342489c6957cSBarry Smith tmp0 = x[*idx]; 342589c6957cSBarry Smith sum1 -= *v1 * tmp0; 342689c6957cSBarry Smith sum2 -= *v2 * tmp0; 342789c6957cSBarry Smith } 3428938d4eb3SBarry Smith x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3429938d4eb3SBarry Smith x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3430938d4eb3SBarry Smith row -= 2; 343189c6957cSBarry Smith break; 343289c6957cSBarry Smith case 3: 343389c6957cSBarry Smith 343489c6957cSBarry Smith sum1 = b[row]; 343589c6957cSBarry Smith sum2 = b[row - 1]; 343689c6957cSBarry Smith sum3 = b[row - 2]; 343789c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 343889c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3; 343989c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 344089c6957cSBarry Smith i1 = idx[0]; 344189c6957cSBarry Smith i2 = idx[1]; 344289c6957cSBarry Smith idx += 2; 344389c6957cSBarry Smith tmp0 = x[i1]; 344489c6957cSBarry Smith tmp1 = x[i2]; 34459371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 34469371c9d4SSatish Balay v1 += 2; 34479371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 34489371c9d4SSatish Balay v2 += 2; 34499371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 34509371c9d4SSatish Balay v3 += 2; 345189c6957cSBarry Smith } 345289c6957cSBarry Smith 345389c6957cSBarry Smith if (n == sz - 1) { 345489c6957cSBarry Smith tmp0 = x[*idx]; 345589c6957cSBarry Smith sum1 -= *v1 * tmp0; 345689c6957cSBarry Smith sum2 -= *v2 * tmp0; 345789c6957cSBarry Smith sum3 -= *v3 * tmp0; 345889c6957cSBarry Smith } 3459938d4eb3SBarry Smith x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3460938d4eb3SBarry Smith x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3461938d4eb3SBarry Smith x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3462938d4eb3SBarry Smith row -= 3; 346389c6957cSBarry Smith break; 
346489c6957cSBarry Smith case 4: 346589c6957cSBarry Smith 346689c6957cSBarry Smith sum1 = b[row]; 346789c6957cSBarry Smith sum2 = b[row - 1]; 346889c6957cSBarry Smith sum3 = b[row - 2]; 346989c6957cSBarry Smith sum4 = b[row - 3]; 347089c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 347189c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3; 347289c6957cSBarry Smith v4 = a->a + diag[row - 3] + 4; 347389c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 347489c6957cSBarry Smith i1 = idx[0]; 347589c6957cSBarry Smith i2 = idx[1]; 347689c6957cSBarry Smith idx += 2; 347789c6957cSBarry Smith tmp0 = x[i1]; 347889c6957cSBarry Smith tmp1 = x[i2]; 34799371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 34809371c9d4SSatish Balay v1 += 2; 34819371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 34829371c9d4SSatish Balay v2 += 2; 34839371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 34849371c9d4SSatish Balay v3 += 2; 34859371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 34869371c9d4SSatish Balay v4 += 2; 348789c6957cSBarry Smith } 348889c6957cSBarry Smith 348989c6957cSBarry Smith if (n == sz - 1) { 349089c6957cSBarry Smith tmp0 = x[*idx]; 349189c6957cSBarry Smith sum1 -= *v1 * tmp0; 349289c6957cSBarry Smith sum2 -= *v2 * tmp0; 349389c6957cSBarry Smith sum3 -= *v3 * tmp0; 349489c6957cSBarry Smith sum4 -= *v4 * tmp0; 349589c6957cSBarry Smith } 3496938d4eb3SBarry Smith x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3497938d4eb3SBarry Smith x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3498938d4eb3SBarry Smith x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3499938d4eb3SBarry Smith x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3500938d4eb3SBarry Smith row -= 4; 350189c6957cSBarry Smith break; 350289c6957cSBarry Smith case 5: 350389c6957cSBarry Smith 350489c6957cSBarry Smith sum1 = 
b[row]; 350589c6957cSBarry Smith sum2 = b[row - 1]; 350689c6957cSBarry Smith sum3 = b[row - 2]; 350789c6957cSBarry Smith sum4 = b[row - 3]; 350889c6957cSBarry Smith sum5 = b[row - 4]; 350989c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 351089c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3; 351189c6957cSBarry Smith v4 = a->a + diag[row - 3] + 4; 351289c6957cSBarry Smith v5 = a->a + diag[row - 4] + 5; 351389c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 351489c6957cSBarry Smith i1 = idx[0]; 351589c6957cSBarry Smith i2 = idx[1]; 351689c6957cSBarry Smith idx += 2; 351789c6957cSBarry Smith tmp0 = x[i1]; 351889c6957cSBarry Smith tmp1 = x[i2]; 35199371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 35209371c9d4SSatish Balay v1 += 2; 35219371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 35229371c9d4SSatish Balay v2 += 2; 35239371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 35249371c9d4SSatish Balay v3 += 2; 35259371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 35269371c9d4SSatish Balay v4 += 2; 35279371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 35289371c9d4SSatish Balay v5 += 2; 352989c6957cSBarry Smith } 353089c6957cSBarry Smith 353189c6957cSBarry Smith if (n == sz - 1) { 353289c6957cSBarry Smith tmp0 = x[*idx]; 353389c6957cSBarry Smith sum1 -= *v1 * tmp0; 353489c6957cSBarry Smith sum2 -= *v2 * tmp0; 353589c6957cSBarry Smith sum3 -= *v3 * tmp0; 353689c6957cSBarry Smith sum4 -= *v4 * tmp0; 353789c6957cSBarry Smith sum5 -= *v5 * tmp0; 353889c6957cSBarry Smith } 3539938d4eb3SBarry Smith x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3540938d4eb3SBarry Smith x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3541938d4eb3SBarry Smith x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3542938d4eb3SBarry Smith x[row - 3] = sum5 * ibdiag[1] + sum4 * 
ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3543938d4eb3SBarry Smith x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3544938d4eb3SBarry Smith row -= 5; 354589c6957cSBarry Smith break; 3546d71ae5a4SJacob Faibussowitsch default: 35474d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 354889c6957cSBarry Smith } 354989c6957cSBarry Smith } 35509566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 355189c6957cSBarry Smith 355289c6957cSBarry Smith /* 355389c6957cSBarry Smith t = b - D x where D is the block diagonal 355489c6957cSBarry Smith */ 355589c6957cSBarry Smith cnt = 0; 355689c6957cSBarry Smith for (i = 0, row = 0; i < m; i++) { 35574d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 35584d12350bSJunchao Zhang switch (nodesz) { 355989c6957cSBarry Smith case 1: 35609371c9d4SSatish Balay t[row] = b[row] - bdiag[cnt++] * x[row]; 35619371c9d4SSatish Balay row++; 356289c6957cSBarry Smith break; 356389c6957cSBarry Smith case 2: 35649371c9d4SSatish Balay x1 = x[row]; 35659371c9d4SSatish Balay x2 = x[row + 1]; 356689c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 356789c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 356889c6957cSBarry Smith t[row] = b[row] - tmp1; 35699371c9d4SSatish Balay t[row + 1] = b[row + 1] - tmp2; 35709371c9d4SSatish Balay row += 2; 357189c6957cSBarry Smith cnt += 4; 357289c6957cSBarry Smith break; 357389c6957cSBarry Smith case 3: 35749371c9d4SSatish Balay x1 = x[row]; 35759371c9d4SSatish Balay x2 = x[row + 1]; 35769371c9d4SSatish Balay x3 = x[row + 2]; 357789c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 357889c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 357989c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] 
+ x3 * bdiag[cnt + 8]; 358089c6957cSBarry Smith t[row] = b[row] - tmp1; 358189c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2; 35829371c9d4SSatish Balay t[row + 2] = b[row + 2] - tmp3; 35839371c9d4SSatish Balay row += 3; 358489c6957cSBarry Smith cnt += 9; 358589c6957cSBarry Smith break; 358689c6957cSBarry Smith case 4: 35879371c9d4SSatish Balay x1 = x[row]; 35889371c9d4SSatish Balay x2 = x[row + 1]; 35899371c9d4SSatish Balay x3 = x[row + 2]; 35909371c9d4SSatish Balay x4 = x[row + 3]; 359189c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 359289c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 359389c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 359489c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 359589c6957cSBarry Smith t[row] = b[row] - tmp1; 359689c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2; 359789c6957cSBarry Smith t[row + 2] = b[row + 2] - tmp3; 35989371c9d4SSatish Balay t[row + 3] = b[row + 3] - tmp4; 35999371c9d4SSatish Balay row += 4; 360089c6957cSBarry Smith cnt += 16; 360189c6957cSBarry Smith break; 360289c6957cSBarry Smith case 5: 36039371c9d4SSatish Balay x1 = x[row]; 36049371c9d4SSatish Balay x2 = x[row + 1]; 36059371c9d4SSatish Balay x3 = x[row + 2]; 36069371c9d4SSatish Balay x4 = x[row + 3]; 36079371c9d4SSatish Balay x5 = x[row + 4]; 360889c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 360989c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 361089c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 361189c6957cSBarry Smith tmp4 = 
x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 361289c6957cSBarry Smith tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 361389c6957cSBarry Smith t[row] = b[row] - tmp1; 361489c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2; 361589c6957cSBarry Smith t[row + 2] = b[row + 2] - tmp3; 361689c6957cSBarry Smith t[row + 3] = b[row + 3] - tmp4; 36179371c9d4SSatish Balay t[row + 4] = b[row + 4] - tmp5; 36189371c9d4SSatish Balay row += 5; 361989c6957cSBarry Smith cnt += 25; 362089c6957cSBarry Smith break; 3621d71ae5a4SJacob Faibussowitsch default: 36224d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 362389c6957cSBarry Smith } 362489c6957cSBarry Smith } 36259566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(m)); 362689c6957cSBarry Smith 362789c6957cSBarry Smith /* 362889c6957cSBarry Smith Apply (L + D)^-1 where D is the block diagonal 362989c6957cSBarry Smith */ 363089c6957cSBarry Smith for (i = 0, row = 0; i < m; i++) { 36314d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 363289c6957cSBarry Smith sz = diag[row] - ii[row]; 363389c6957cSBarry Smith v1 = a->a + ii[row]; 363489c6957cSBarry Smith idx = a->j + ii[row]; 36354108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 36364d12350bSJunchao Zhang switch (nodesz) { 363789c6957cSBarry Smith case 1: 363889c6957cSBarry Smith 363989c6957cSBarry Smith sum1 = t[row]; 364089c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 364189c6957cSBarry Smith i1 = idx[0]; 364289c6957cSBarry Smith i2 = idx[1]; 364389c6957cSBarry Smith idx += 2; 364489c6957cSBarry Smith tmp0 = t[i1]; 364589c6957cSBarry Smith tmp1 = t[i2]; 36469371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 36479371c9d4SSatish Balay v1 += 2; 364889c6957cSBarry Smith } 364989c6957cSBarry 
Smith 365089c6957cSBarry Smith if (n == sz - 1) { 365189c6957cSBarry Smith tmp0 = t[*idx]; 365289c6957cSBarry Smith sum1 -= *v1 * tmp0; 365389c6957cSBarry Smith } 36549371c9d4SSatish Balay x[row] += t[row] = sum1 * (*ibdiag++); 36559371c9d4SSatish Balay row++; 365689c6957cSBarry Smith break; 365789c6957cSBarry Smith case 2: 365889c6957cSBarry Smith v2 = a->a + ii[row + 1]; 365989c6957cSBarry Smith sum1 = t[row]; 366089c6957cSBarry Smith sum2 = t[row + 1]; 366189c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 366289c6957cSBarry Smith i1 = idx[0]; 366389c6957cSBarry Smith i2 = idx[1]; 366489c6957cSBarry Smith idx += 2; 366589c6957cSBarry Smith tmp0 = t[i1]; 366689c6957cSBarry Smith tmp1 = t[i2]; 36679371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 36689371c9d4SSatish Balay v1 += 2; 36699371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 36709371c9d4SSatish Balay v2 += 2; 367189c6957cSBarry Smith } 367289c6957cSBarry Smith 367389c6957cSBarry Smith if (n == sz - 1) { 367489c6957cSBarry Smith tmp0 = t[*idx]; 367589c6957cSBarry Smith sum1 -= v1[0] * tmp0; 367689c6957cSBarry Smith sum2 -= v2[0] * tmp0; 367789c6957cSBarry Smith } 367889c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 367989c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 36809371c9d4SSatish Balay ibdiag += 4; 36819371c9d4SSatish Balay row += 2; 368289c6957cSBarry Smith break; 368389c6957cSBarry Smith case 3: 368489c6957cSBarry Smith v2 = a->a + ii[row + 1]; 368589c6957cSBarry Smith v3 = a->a + ii[row + 2]; 368689c6957cSBarry Smith sum1 = t[row]; 368789c6957cSBarry Smith sum2 = t[row + 1]; 368889c6957cSBarry Smith sum3 = t[row + 2]; 368989c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 369089c6957cSBarry Smith i1 = idx[0]; 369189c6957cSBarry Smith i2 = idx[1]; 369289c6957cSBarry Smith idx += 2; 369389c6957cSBarry Smith tmp0 = t[i1]; 369489c6957cSBarry Smith tmp1 = t[i2]; 36959371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + 
v1[1] * tmp1; 36969371c9d4SSatish Balay v1 += 2; 36979371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 36989371c9d4SSatish Balay v2 += 2; 36999371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 37009371c9d4SSatish Balay v3 += 2; 370189c6957cSBarry Smith } 370289c6957cSBarry Smith 370389c6957cSBarry Smith if (n == sz - 1) { 370489c6957cSBarry Smith tmp0 = t[*idx]; 370589c6957cSBarry Smith sum1 -= v1[0] * tmp0; 370689c6957cSBarry Smith sum2 -= v2[0] * tmp0; 370789c6957cSBarry Smith sum3 -= v3[0] * tmp0; 370889c6957cSBarry Smith } 370989c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 371089c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 371189c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 37129371c9d4SSatish Balay ibdiag += 9; 37139371c9d4SSatish Balay row += 3; 371489c6957cSBarry Smith break; 371589c6957cSBarry Smith case 4: 371689c6957cSBarry Smith v2 = a->a + ii[row + 1]; 371789c6957cSBarry Smith v3 = a->a + ii[row + 2]; 371889c6957cSBarry Smith v4 = a->a + ii[row + 3]; 371989c6957cSBarry Smith sum1 = t[row]; 372089c6957cSBarry Smith sum2 = t[row + 1]; 372189c6957cSBarry Smith sum3 = t[row + 2]; 372289c6957cSBarry Smith sum4 = t[row + 3]; 372389c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 372489c6957cSBarry Smith i1 = idx[0]; 372589c6957cSBarry Smith i2 = idx[1]; 372689c6957cSBarry Smith idx += 2; 372789c6957cSBarry Smith tmp0 = t[i1]; 372889c6957cSBarry Smith tmp1 = t[i2]; 37299371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 37309371c9d4SSatish Balay v1 += 2; 37319371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 37329371c9d4SSatish Balay v2 += 2; 37339371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 37349371c9d4SSatish Balay v3 += 2; 37359371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 37369371c9d4SSatish Balay v4 += 2; 373789c6957cSBarry Smith } 
373889c6957cSBarry Smith 373989c6957cSBarry Smith if (n == sz - 1) { 374089c6957cSBarry Smith tmp0 = t[*idx]; 374189c6957cSBarry Smith sum1 -= v1[0] * tmp0; 374289c6957cSBarry Smith sum2 -= v2[0] * tmp0; 374389c6957cSBarry Smith sum3 -= v3[0] * tmp0; 374489c6957cSBarry Smith sum4 -= v4[0] * tmp0; 374589c6957cSBarry Smith } 374689c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 374789c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 374889c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 374989c6957cSBarry Smith x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 37509371c9d4SSatish Balay ibdiag += 16; 37519371c9d4SSatish Balay row += 4; 375289c6957cSBarry Smith break; 375389c6957cSBarry Smith case 5: 375489c6957cSBarry Smith v2 = a->a + ii[row + 1]; 375589c6957cSBarry Smith v3 = a->a + ii[row + 2]; 375689c6957cSBarry Smith v4 = a->a + ii[row + 3]; 375789c6957cSBarry Smith v5 = a->a + ii[row + 4]; 375889c6957cSBarry Smith sum1 = t[row]; 375989c6957cSBarry Smith sum2 = t[row + 1]; 376089c6957cSBarry Smith sum3 = t[row + 2]; 376189c6957cSBarry Smith sum4 = t[row + 3]; 376289c6957cSBarry Smith sum5 = t[row + 4]; 376389c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 376489c6957cSBarry Smith i1 = idx[0]; 376589c6957cSBarry Smith i2 = idx[1]; 376689c6957cSBarry Smith idx += 2; 376789c6957cSBarry Smith tmp0 = t[i1]; 376889c6957cSBarry Smith tmp1 = t[i2]; 37699371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 37709371c9d4SSatish Balay v1 += 2; 37719371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 37729371c9d4SSatish Balay v2 += 2; 37739371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 37749371c9d4SSatish Balay v3 += 2; 37759371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 
37769371c9d4SSatish Balay v4 += 2; 37779371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 37789371c9d4SSatish Balay v5 += 2; 377989c6957cSBarry Smith } 378089c6957cSBarry Smith 378189c6957cSBarry Smith if (n == sz - 1) { 378289c6957cSBarry Smith tmp0 = t[*idx]; 378389c6957cSBarry Smith sum1 -= v1[0] * tmp0; 378489c6957cSBarry Smith sum2 -= v2[0] * tmp0; 378589c6957cSBarry Smith sum3 -= v3[0] * tmp0; 378689c6957cSBarry Smith sum4 -= v4[0] * tmp0; 378789c6957cSBarry Smith sum5 -= v5[0] * tmp0; 378889c6957cSBarry Smith } 378989c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 379089c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 379189c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 379289c6957cSBarry Smith x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 379389c6957cSBarry Smith x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 37949371c9d4SSatish Balay ibdiag += 25; 37959371c9d4SSatish Balay row += 5; 379689c6957cSBarry Smith break; 3797d71ae5a4SJacob Faibussowitsch default: 37984d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 379989c6957cSBarry Smith } 380089c6957cSBarry Smith } 38019566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 38025850ef23SBarry Smith } 38039566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 38049566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 38053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 38062af78befSBarry Smith } 38072af78befSBarry Smith 
3808ff6a9541SJacob Faibussowitsch static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 3809d71ae5a4SJacob Faibussowitsch { 381089c6957cSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 381189c6957cSBarry Smith PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 381289c6957cSBarry Smith const MatScalar *bdiag = a->inode.bdiag; 381389c6957cSBarry Smith const PetscScalar *b; 38144d12350bSJunchao Zhang PetscInt m = a->inode.node_count, cnt = 0, i, row, nodesz; 38154d12350bSJunchao Zhang const PetscInt *sizes = a->inode.size_csr; 38162af78befSBarry Smith 381789c6957cSBarry Smith PetscFunctionBegin; 38184d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 38199566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 38209566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 382189c6957cSBarry Smith cnt = 0; 382289c6957cSBarry Smith for (i = 0, row = 0; i < m; i++) { 38234d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 38244d12350bSJunchao Zhang switch (nodesz) { 382589c6957cSBarry Smith case 1: 38269371c9d4SSatish Balay x[row] = b[row] * bdiag[cnt++]; 38279371c9d4SSatish Balay row++; 382889c6957cSBarry Smith break; 382989c6957cSBarry Smith case 2: 38309371c9d4SSatish Balay x1 = b[row]; 38319371c9d4SSatish Balay x2 = b[row + 1]; 383289c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 383389c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 383489c6957cSBarry Smith x[row++] = tmp1; 383589c6957cSBarry Smith x[row++] = tmp2; 383689c6957cSBarry Smith cnt += 4; 383789c6957cSBarry Smith break; 383889c6957cSBarry Smith case 3: 38399371c9d4SSatish Balay x1 = b[row]; 38409371c9d4SSatish Balay x2 = b[row + 1]; 38419371c9d4SSatish Balay x3 = b[row + 2]; 384289c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 384389c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * 
bdiag[cnt + 7]; 384489c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 384589c6957cSBarry Smith x[row++] = tmp1; 384689c6957cSBarry Smith x[row++] = tmp2; 384789c6957cSBarry Smith x[row++] = tmp3; 384889c6957cSBarry Smith cnt += 9; 384989c6957cSBarry Smith break; 385089c6957cSBarry Smith case 4: 38519371c9d4SSatish Balay x1 = b[row]; 38529371c9d4SSatish Balay x2 = b[row + 1]; 38539371c9d4SSatish Balay x3 = b[row + 2]; 38549371c9d4SSatish Balay x4 = b[row + 3]; 385589c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 385689c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 385789c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 385889c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 385989c6957cSBarry Smith x[row++] = tmp1; 386089c6957cSBarry Smith x[row++] = tmp2; 386189c6957cSBarry Smith x[row++] = tmp3; 386289c6957cSBarry Smith x[row++] = tmp4; 386389c6957cSBarry Smith cnt += 16; 386489c6957cSBarry Smith break; 386589c6957cSBarry Smith case 5: 38669371c9d4SSatish Balay x1 = b[row]; 38679371c9d4SSatish Balay x2 = b[row + 1]; 38689371c9d4SSatish Balay x3 = b[row + 2]; 38699371c9d4SSatish Balay x4 = b[row + 3]; 38709371c9d4SSatish Balay x5 = b[row + 4]; 387189c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 387289c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 387389c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 387489c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * 
bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 387589c6957cSBarry Smith tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 387689c6957cSBarry Smith x[row++] = tmp1; 387789c6957cSBarry Smith x[row++] = tmp2; 387889c6957cSBarry Smith x[row++] = tmp3; 387989c6957cSBarry Smith x[row++] = tmp4; 388089c6957cSBarry Smith x[row++] = tmp5; 388189c6957cSBarry Smith cnt += 25; 388289c6957cSBarry Smith break; 3883d71ae5a4SJacob Faibussowitsch default: 38844d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 388589c6957cSBarry Smith } 388689c6957cSBarry Smith } 38879566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * cnt)); 38889566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 38899566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 38903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 389189c6957cSBarry Smith } 389289c6957cSBarry Smith 3893d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 3894d71ae5a4SJacob Faibussowitsch { 3895b215bc84SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3896b215bc84SStefano Zampini 3897b215bc84SStefano Zampini PetscFunctionBegin; 3898b215bc84SStefano Zampini a->inode.node_count = 0; 3899b215bc84SStefano Zampini a->inode.use = PETSC_FALSE; 3900b215bc84SStefano Zampini a->inode.checked = PETSC_FALSE; 3901b215bc84SStefano Zampini a->inode.mat_nonzerostate = -1; 3902b215bc84SStefano Zampini A->ops->getrowij = MatGetRowIJ_SeqAIJ; 3903b215bc84SStefano Zampini A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 3904b215bc84SStefano Zampini A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 3905b215bc84SStefano Zampini A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 3906b215bc84SStefano Zampini A->ops->coloringpatch = NULL; 3907b215bc84SStefano Zampini A->ops->multdiagonalblock = NULL; 3908ad540459SPierre 
Jolivet if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 39093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3910b215bc84SStefano Zampini } 3911b215bc84SStefano Zampini 39124c1414c8SBarry Smith /* 39134c1414c8SBarry Smith samestructure indicates that the matrix has not changed its nonzero structure so we 39144c1414c8SBarry Smith do not need to recompute the inodes 39154c1414c8SBarry Smith */ 3916d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode(Mat A) 3917d71ae5a4SJacob Faibussowitsch { 39184c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 39198758e1faSBarry Smith PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 3920ace3abfcSBarry Smith PetscBool flag; 39218758e1faSBarry Smith const PetscInt *idx, *idy, *ii; 39224c1414c8SBarry Smith 39234c1414c8SBarry Smith PetscFunctionBegin; 3924b215bc84SStefano Zampini if (!a->inode.use) { 39259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 39264d12350bSJunchao Zhang PetscCall(PetscFree(a->inode.size_csr)); 39273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3928b215bc84SStefano Zampini } 39293ba16761SJacob Faibussowitsch if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 39304c1414c8SBarry Smith 3931d0f46423SBarry Smith m = A->rmap->n; 39324d12350bSJunchao Zhang if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr)); 39334d12350bSJunchao Zhang ns = a->inode.size_csr; 39344d12350bSJunchao Zhang ns[0] = 0; 39354c1414c8SBarry Smith 39364c1414c8SBarry Smith i = 0; 39374c1414c8SBarry Smith node_count = 0; 39384c1414c8SBarry Smith idx = a->j; 39394c1414c8SBarry Smith ii = a->i; 39406f2c871aSStefano Zampini if (idx) { 39414c1414c8SBarry Smith while (i < m) { /* For each row */ 39424c1414c8SBarry Smith nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 39434c1414c8SBarry Smith /* Limits the number of elements in a node to 'a->inode.limit' */ 39444c1414c8SBarry 
Smith for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 39454c1414c8SBarry Smith nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 39464c1414c8SBarry Smith if (nzy != nzx) break; 39474c1414c8SBarry Smith idy += nzx; /* Same nonzero pattern */ 39489566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 39494c1414c8SBarry Smith if (!flag) break; 39504c1414c8SBarry Smith } 39514d12350bSJunchao Zhang ns[node_count + 1] = ns[node_count] + blk_size; 39524d12350bSJunchao Zhang node_count++; 39534c1414c8SBarry Smith idx += blk_size * nzx; 39544c1414c8SBarry Smith i = j; 39554c1414c8SBarry Smith } 39566f2c871aSStefano Zampini } 39574c1414c8SBarry Smith /* If not enough inodes found,, do not use inode version of the routines */ 39586f2c871aSStefano Zampini if (!m || !idx || node_count > .8 * m) { 39599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 39604d12350bSJunchao Zhang PetscCall(PetscFree(a->inode.size_csr)); 39619566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. 
Not using Inode routines\n", node_count, m)); 39624c1414c8SBarry Smith } else { 3963d5f3da31SBarry Smith if (!A->factortype) { 3964375a6242SBarry Smith A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3965375a6242SBarry Smith if (A->rmap->n == A->cmap->n) { 39664108e4d5SBarry Smith A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 39674108e4d5SBarry Smith A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 39684108e4d5SBarry Smith A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 39694108e4d5SBarry Smith A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 39704108e4d5SBarry Smith A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3971375a6242SBarry Smith } 3972d3ac4fa3SBarry Smith } else { 3973d3ac4fa3SBarry Smith A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 3974d3ac4fa3SBarry Smith } 39754c1414c8SBarry Smith a->inode.node_count = node_count; 39769566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". 
Using Inode routines\n", node_count, m, a->inode.limit)); 39774c1414c8SBarry Smith } 3978be6adb11SBarry Smith a->inode.checked = PETSC_TRUE; 3979a02bda8eSBarry Smith a->inode.mat_nonzerostate = A->nonzerostate; 39803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 39814c1414c8SBarry Smith } 39824c1414c8SBarry Smith 3983d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 3984d71ae5a4SJacob Faibussowitsch { 3985150f0143SBarry Smith Mat B = *C; 3986150f0143SBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 3987150f0143SBarry Smith PetscInt m = A->rmap->n; 3988150f0143SBarry Smith 3989150f0143SBarry Smith PetscFunctionBegin; 3990150f0143SBarry Smith c->inode.use = a->inode.use; 3991150f0143SBarry Smith c->inode.limit = a->inode.limit; 3992150f0143SBarry Smith c->inode.max_limit = a->inode.max_limit; 3993ec710b6aSStefano Zampini c->inode.checked = PETSC_FALSE; 39944d12350bSJunchao Zhang c->inode.size_csr = NULL; 3995ec710b6aSStefano Zampini c->inode.node_count = 0; 3996ec710b6aSStefano Zampini c->inode.ibdiag = NULL; 3997ec710b6aSStefano Zampini c->inode.bdiag = NULL; 3998ec710b6aSStefano Zampini c->inode.mat_nonzerostate = -1; 3999b215bc84SStefano Zampini if (a->inode.use) { 40004d12350bSJunchao Zhang if (a->inode.checked && a->inode.size_csr) { 40014d12350bSJunchao Zhang PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr)); 40024d12350bSJunchao Zhang PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1)); 4003ec710b6aSStefano Zampini 4004ec710b6aSStefano Zampini c->inode.checked = PETSC_TRUE; 4005ec710b6aSStefano Zampini c->inode.node_count = a->inode.node_count; 4006ec710b6aSStefano Zampini c->inode.mat_nonzerostate = (*C)->nonzerostate; 4007ec710b6aSStefano Zampini } 4008a02bda8eSBarry Smith /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 40092c451681SBarry Smith if (!B->factortype) { 40102c451681SBarry Smith 
B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 40112c451681SBarry Smith B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 40122c451681SBarry Smith B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 40132c451681SBarry Smith B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 40142c451681SBarry Smith B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 40152c451681SBarry Smith B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4016150f0143SBarry Smith } else { 40172c451681SBarry Smith B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4018150f0143SBarry Smith } 4019150f0143SBarry Smith } 40203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4021150f0143SBarry Smith } 4022150f0143SBarry Smith 4023d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4024d71ae5a4SJacob Faibussowitsch { 40258758e1faSBarry Smith PetscInt k; 40268758e1faSBarry Smith const PetscInt *vi; 40276e111a19SKarl Rupp 402817454e89SShri Abhyankar PetscFunctionBegin; 402917454e89SShri Abhyankar vi = aj + ai[row]; 403017454e89SShri Abhyankar for (k = 0; k < nzl; k++) cols[k] = vi[k]; 403117454e89SShri Abhyankar vi = aj + adiag[row]; 403217454e89SShri Abhyankar cols[nzl] = vi[0]; 403317454e89SShri Abhyankar vi = aj + adiag[row + 1] + 1; 403417454e89SShri Abhyankar for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 40353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 403617454e89SShri Abhyankar } 40376936b636SHong Zhang /* 4038a02bda8eSBarry Smith MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4039a02bda8eSBarry Smith Modified from MatSeqAIJCheckInode(). 40406936b636SHong Zhang 40416936b636SHong Zhang Input Parameters: 4042abb87a52SBarry Smith . 
Mat A - ILU or LU matrix factor 4043abb87a52SBarry Smith 40446936b636SHong Zhang */ 4045d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4046d71ae5a4SJacob Faibussowitsch { 4047019b515eSShri Abhyankar Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4048019b515eSShri Abhyankar PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 40498758e1faSBarry Smith PetscInt *cols1, *cols2, *ns; 40508758e1faSBarry Smith const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4051ace3abfcSBarry Smith PetscBool flag; 4052019b515eSShri Abhyankar 4053019b515eSShri Abhyankar PetscFunctionBegin; 40543ba16761SJacob Faibussowitsch if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 40553ba16761SJacob Faibussowitsch if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4056019b515eSShri Abhyankar 4057019b515eSShri Abhyankar m = A->rmap->n; 40584d12350bSJunchao Zhang if (a->inode.size_csr) ns = a->inode.size_csr; 405948a46eb9SPierre Jolivet else PetscCall(PetscMalloc1(m + 1, &ns)); 40604d12350bSJunchao Zhang ns[0] = 0; 4061019b515eSShri Abhyankar 4062019b515eSShri Abhyankar i = 0; 4063019b515eSShri Abhyankar node_count = 0; 40649566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4065019b515eSShri Abhyankar while (i < m) { /* For each row */ 4066019b515eSShri Abhyankar nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4067019b515eSShri Abhyankar nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4068019b515eSShri Abhyankar nzx = nzl1 + nzu1 + 1; 40693ba16761SJacob Faibussowitsch PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4070019b515eSShri Abhyankar 4071019b515eSShri Abhyankar /* Limits the number of elements in a node to 'a->inode.limit' */ 4072019b515eSShri Abhyankar for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4073019b515eSShri Abhyankar nzl2 = ai[j + 1] - ai[j]; 4074019b515eSShri Abhyankar nzu2 = 
adiag[j] - adiag[j + 1] - 1; 4075019b515eSShri Abhyankar nzy = nzl2 + nzu2 + 1; 4076019b515eSShri Abhyankar if (nzy != nzx) break; 40779566063dSJacob Faibussowitsch PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 40789566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 40798758e1faSBarry Smith if (!flag) break; 4080019b515eSShri Abhyankar } 40814d12350bSJunchao Zhang ns[node_count + 1] = ns[node_count] + blk_size; 40824d12350bSJunchao Zhang node_count++; 4083019b515eSShri Abhyankar i = j; 4084019b515eSShri Abhyankar } 40859566063dSJacob Faibussowitsch PetscCall(PetscFree2(cols1, cols2)); 4086019b515eSShri Abhyankar /* If not enough inodes found,, do not use inode version of the routines */ 4087be6adb11SBarry Smith if (!m || node_count > .8 * m) { 40889566063dSJacob Faibussowitsch PetscCall(PetscFree(ns)); 40892205254eSKarl Rupp 4090019b515eSShri Abhyankar a->inode.node_count = 0; 40914d12350bSJunchao Zhang a->inode.size_csr = NULL; 4092019b515eSShri Abhyankar a->inode.use = PETSC_FALSE; 40932205254eSKarl Rupp 40949566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. 
Not using Inode routines\n", node_count, m)); 4095019b515eSShri Abhyankar } else { 4096f4259b30SLisandro Dalcin A->ops->mult = NULL; 4097f4259b30SLisandro Dalcin A->ops->sor = NULL; 4098f4259b30SLisandro Dalcin A->ops->multadd = NULL; 4099f4259b30SLisandro Dalcin A->ops->getrowij = NULL; 4100f4259b30SLisandro Dalcin A->ops->restorerowij = NULL; 4101f4259b30SLisandro Dalcin A->ops->getcolumnij = NULL; 4102f4259b30SLisandro Dalcin A->ops->restorecolumnij = NULL; 4103f4259b30SLisandro Dalcin A->ops->coloringpatch = NULL; 4104f4259b30SLisandro Dalcin A->ops->multdiagonalblock = NULL; 4105019b515eSShri Abhyankar a->inode.node_count = node_count; 41064d12350bSJunchao Zhang a->inode.size_csr = ns; 41079566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4108019b515eSShri Abhyankar } 4109be6adb11SBarry Smith a->inode.checked = PETSC_TRUE; 41103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4111019b515eSShri Abhyankar } 4112019b515eSShri Abhyankar 41134c1414c8SBarry Smith /* 41144c1414c8SBarry Smith This is really ugly. 
if inodes are used this replaces the 41154c1414c8SBarry Smith permutations with ones that correspond to rows/cols of the matrix 4116467446fbSPierre Jolivet rather than inode blocks 41174c1414c8SBarry Smith */ 4118d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4119d71ae5a4SJacob Faibussowitsch { 41204c1414c8SBarry Smith PetscFunctionBegin; 4121cac4c232SBarry Smith PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 41223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41234c1414c8SBarry Smith } 41244c1414c8SBarry Smith 4125d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4126d71ae5a4SJacob Faibussowitsch { 41274c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 41285d0c19d7SBarry Smith PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 41295d0c19d7SBarry Smith const PetscInt *ridx, *cidx; 41304d12350bSJunchao Zhang PetscInt row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx; 41314c1414c8SBarry Smith PetscInt nslim_col, *ns_col; 41324c1414c8SBarry Smith IS ris = *rperm, cis = *cperm; 41334c1414c8SBarry Smith 41344c1414c8SBarry Smith PetscFunctionBegin; 41354d12350bSJunchao Zhang if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 41363ba16761SJacob Faibussowitsch if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 41374c1414c8SBarry Smith 41389566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 413932603206SJames Wright PetscCall(PetscMalloc1(((nslim_row > nslim_col ? 
nslim_row : nslim_col) + 1), &tns)); 41409566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(m, &permr, n, &permc)); 41414c1414c8SBarry Smith 41429566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ris, &ridx)); 41439566063dSJacob Faibussowitsch PetscCall(ISGetIndices(cis, &cidx)); 41444c1414c8SBarry Smith 4145baca6076SPierre Jolivet /* Form the inode structure for the rows of permuted matrix using inv perm*/ 41464d12350bSJunchao Zhang for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]); 41474c1414c8SBarry Smith 41484c1414c8SBarry Smith /* Construct the permutations for rows*/ 41494c1414c8SBarry Smith for (i = 0, row = 0; i < nslim_row; ++i) { 41504c1414c8SBarry Smith indx = ridx[i]; 41514c1414c8SBarry Smith start_val = tns[indx]; 41524c1414c8SBarry Smith end_val = tns[indx + 1]; 41534c1414c8SBarry Smith for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 41544c1414c8SBarry Smith } 41554c1414c8SBarry Smith 41564c1414c8SBarry Smith /* Form the inode structure for the columns of permuted matrix using inv perm*/ 41574d12350bSJunchao Zhang for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]); 41584c1414c8SBarry Smith 41594c1414c8SBarry Smith /* Construct permutations for columns */ 41604c1414c8SBarry Smith for (i = 0, col = 0; i < nslim_col; ++i) { 41614c1414c8SBarry Smith indx = cidx[i]; 41624c1414c8SBarry Smith start_val = tns[indx]; 41634c1414c8SBarry Smith end_val = tns[indx + 1]; 41644c1414c8SBarry Smith for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 41654c1414c8SBarry Smith } 41664c1414c8SBarry Smith 41679566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 41689566063dSJacob Faibussowitsch PetscCall(ISSetPermutation(*rperm)); 41699566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 41709566063dSJacob Faibussowitsch 
PetscCall(ISSetPermutation(*cperm)); 41714c1414c8SBarry Smith 41729566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ris, &ridx)); 41739566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(cis, &cidx)); 41744c1414c8SBarry Smith 41759566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col)); 41769566063dSJacob Faibussowitsch PetscCall(PetscFree2(permr, permc)); 41779566063dSJacob Faibussowitsch PetscCall(ISDestroy(&cis)); 41789566063dSJacob Faibussowitsch PetscCall(ISDestroy(&ris)); 41799566063dSJacob Faibussowitsch PetscCall(PetscFree(tns)); 41803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41814c1414c8SBarry Smith } 41824c1414c8SBarry Smith 41834c1414c8SBarry Smith /*@C 418411a5261eSBarry Smith MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 41854c1414c8SBarry Smith 41863f9fe445SBarry Smith Not Collective 41874c1414c8SBarry Smith 41884c1414c8SBarry Smith Input Parameter: 418911a5261eSBarry Smith . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 41904c1414c8SBarry Smith 4191d8d19677SJose E. Roman Output Parameters: 41924c1414c8SBarry Smith + node_count - no of inodes present in the matrix. 41932ef1f0ffSBarry Smith . sizes - an array of size `node_count`, with the sizes of each inode. 41944c1414c8SBarry Smith - limit - the max size used to generate the inodes. 41954c1414c8SBarry Smith 41964c1414c8SBarry Smith Level: advanced 41974c1414c8SBarry Smith 419811a5261eSBarry Smith Note: 41994c1414c8SBarry Smith It should be called after the matrix is assembled. 42004c1414c8SBarry Smith The contents of the sizes[] array should not be changed. 
42012ef1f0ffSBarry Smith `NULL` may be passed for information not needed 42024c1414c8SBarry Smith 42031cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatGetInfo()` 42044c1414c8SBarry Smith @*/ 4205d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4206d71ae5a4SJacob Faibussowitsch { 42075f80ce2aSJacob Faibussowitsch PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 42084c1414c8SBarry Smith 42094c1414c8SBarry Smith PetscFunctionBegin; 42105f80ce2aSJacob Faibussowitsch PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 42119566063dSJacob Faibussowitsch PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 42129566063dSJacob Faibussowitsch if (f) PetscCall((*f)(A, node_count, sizes, limit)); 42133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 42144c1414c8SBarry Smith } 42154c1414c8SBarry Smith 4216d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4217d71ae5a4SJacob Faibussowitsch { 42184c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 42194c1414c8SBarry Smith 42204c1414c8SBarry Smith PetscFunctionBegin; 42214c1414c8SBarry Smith if (node_count) *node_count = a->inode.node_count; 42224d12350bSJunchao Zhang if (sizes) *sizes = a->inode.size_csr; 42234c1414c8SBarry Smith if (limit) *limit = a->inode.limit; 42243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 42254c1414c8SBarry Smith } 4226