14c1414c8SBarry Smith /* 24c1414c8SBarry Smith This file provides high performance routines for the Inode format (compressed sparse row) 34c1414c8SBarry Smith by taking advantage of rows with identical nonzero structure (I-nodes). 44c1414c8SBarry Smith */ 5c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> 6fb56d528SJed Brown #if defined(PETSC_HAVE_XMMINTRIN_H) 7fb56d528SJed Brown #include <xmmintrin.h> 8fb56d528SJed Brown #endif 94c1414c8SBarry Smith 10d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 11d71ae5a4SJacob Faibussowitsch { 124c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 134c1414c8SBarry Smith PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 144c1414c8SBarry Smith 154c1414c8SBarry Smith PetscFunctionBegin; 16d0f46423SBarry Smith n = A->cmap->n; 17d0f46423SBarry Smith m = A->rmap->n; 18*4d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 19*4d12350bSJunchao Zhang ns_row = a->inode.size_csr; 204c1414c8SBarry Smith 214c1414c8SBarry Smith min_mn = (m < n) ? m : n; 224c1414c8SBarry Smith if (!ns) { 23*4d12350bSJunchao Zhang for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++); 24fbccb6d4SPierre Jolivet for (; count + 1 < n; count++, i++); 25ad540459SPierre Jolivet if (count < n) i++; 264c1414c8SBarry Smith *size = i; 273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 284c1414c8SBarry Smith } 299566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &ns_col)); 30*4d12350bSJunchao Zhang ns_col[0] = 0; 314c1414c8SBarry Smith 324c1414c8SBarry Smith /* Use the same row structure wherever feasible. 
*/ 33*4d12350bSJunchao Zhang for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1]; 344c1414c8SBarry Smith 354c1414c8SBarry Smith /* if m < n; pad up the remainder with inode_limit */ 36*4d12350bSJunchao Zhang for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1; 37aaa8cc7dSPierre Jolivet /* The last node is the odd ball. pad it up with the remaining rows; */ 384c1414c8SBarry Smith if (count < n) { 39*4d12350bSJunchao Zhang ns_col[i + 1] = ns_col[i] + (n - count); 404c1414c8SBarry Smith i++; 414c1414c8SBarry Smith } else if (count > n) { 424c1414c8SBarry Smith /* Adjust for the over estimation */ 43*4d12350bSJunchao Zhang ns_col[i] += n - count; 444c1414c8SBarry Smith } 454c1414c8SBarry Smith *size = i; 464c1414c8SBarry Smith *ns = ns_col; 473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 484c1414c8SBarry Smith } 494c1414c8SBarry Smith 504c1414c8SBarry Smith /* 514c1414c8SBarry Smith This builds symmetric version of nonzero structure, 524c1414c8SBarry Smith */ 53d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 54d71ae5a4SJacob Faibussowitsch { 554c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 568758e1faSBarry Smith PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 57*4d12350bSJunchao Zhang PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2; 588758e1faSBarry Smith const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 594c1414c8SBarry Smith 604c1414c8SBarry Smith PetscFunctionBegin; 614c1414c8SBarry Smith nslim_row = a->inode.node_count; 62d0f46423SBarry Smith m = A->rmap->n; 63d0f46423SBarry Smith n = A->cmap->n; 6408401ef6SPierre Jolivet PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 65*4d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, 
PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 664c1414c8SBarry Smith 674c1414c8SBarry Smith /* Use the row_inode as column_inode */ 684c1414c8SBarry Smith nslim_col = nslim_row; 694c1414c8SBarry Smith ns_col = ns_row; 704c1414c8SBarry Smith 7135cb6cd3SPierre Jolivet /* allocate space for reformatted inode structure */ 729566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 73*4d12350bSJunchao Zhang for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]); 744c1414c8SBarry Smith 754c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 76*4d12350bSJunchao Zhang nsz = ns_col[i1 + 1] - ns_col[i1]; 772205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 784c1414c8SBarry Smith } 794c1414c8SBarry Smith /* allocate space for row pointers */ 809566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 814c1414c8SBarry Smith *iia = ia; 829566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_row + 1, &work)); 834c1414c8SBarry Smith 844c1414c8SBarry Smith /* determine the number of columns in each row */ 854c1414c8SBarry Smith ia[0] = oshift; 86*4d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) { 87*4d12350bSJunchao Zhang row = ns_row[i1]; 884c1414c8SBarry Smith j = aj + ai[row] + ishift; 894c1414c8SBarry Smith jmax = aj + ai[row + 1] + ishift; 9083fed2edSSatish Balay if (j == jmax) continue; /* empty row */ 914c1414c8SBarry Smith col = *j++ + ishift; 924c1414c8SBarry Smith i2 = tvc[col]; 936aad120cSJose E. 
Roman while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 944c1414c8SBarry Smith ia[i1 + 1]++; 954c1414c8SBarry Smith ia[i2 + 1]++; 964c1414c8SBarry Smith i2++; /* Start col of next node */ 9790d2dec7SBarry Smith while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 984c1414c8SBarry Smith i2 = tvc[col]; 994c1414c8SBarry Smith } 1004c1414c8SBarry Smith if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 1014c1414c8SBarry Smith } 1024c1414c8SBarry Smith 1034c1414c8SBarry Smith /* shift ia[i] to point to next row */ 1044c1414c8SBarry Smith for (i1 = 1; i1 < nslim_row + 1; i1++) { 1054c1414c8SBarry Smith row = ia[i1 - 1]; 1064c1414c8SBarry Smith ia[i1] += row; 1074c1414c8SBarry Smith work[i1 - 1] = row - oshift; 1084c1414c8SBarry Smith } 1094c1414c8SBarry Smith 1104c1414c8SBarry Smith /* allocate space for column pointers */ 1114c1414c8SBarry Smith nz = ia[nslim_row] + (!ishift); 1129566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja)); 1134c1414c8SBarry Smith *jja = ja; 1144c1414c8SBarry Smith 1154c1414c8SBarry Smith /* loop over lower triangular part putting into ja */ 116*4d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) { 117*4d12350bSJunchao Zhang row = ns_row[i1]; 1184c1414c8SBarry Smith j = aj + ai[row] + ishift; 1194c1414c8SBarry Smith jmax = aj + ai[row + 1] + ishift; 12083fed2edSSatish Balay if (j == jmax) continue; /* empty row */ 1214c1414c8SBarry Smith col = *j++ + ishift; 1224c1414c8SBarry Smith i2 = tvc[col]; 1234c1414c8SBarry Smith while (i2 < i1 && j < jmax) { 1244c1414c8SBarry Smith ja[work[i2]++] = i1 + oshift; 1254c1414c8SBarry Smith ja[work[i1]++] = i2 + oshift; 1264c1414c8SBarry Smith ++i2; 12790d2dec7SBarry Smith while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 1284c1414c8SBarry Smith i2 = tvc[col]; 1294c1414c8SBarry Smith } 1304c1414c8SBarry Smith if (i2 == i1) ja[work[i1]++] = i2 + oshift; 1314c1414c8SBarry Smith } 
1329566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 1339566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc)); 1343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1354c1414c8SBarry Smith } 1364c1414c8SBarry Smith 1374c1414c8SBarry Smith /* 1384c1414c8SBarry Smith This builds nonsymmetric version of nonzero structure, 1394c1414c8SBarry Smith */ 140d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 141d71ae5a4SJacob Faibussowitsch { 1424c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1438758e1faSBarry Smith PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 1448758e1faSBarry Smith PetscInt *tns, *tvc, nsz, i1, i2; 145*4d12350bSJunchao Zhang const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr; 1464c1414c8SBarry Smith 1474c1414c8SBarry Smith PetscFunctionBegin; 148*4d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 1494c1414c8SBarry Smith nslim_row = a->inode.node_count; 150d0f46423SBarry Smith n = A->cmap->n; 1514c1414c8SBarry Smith 1524c1414c8SBarry Smith /* Create The column_inode for this matrix */ 1539566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 1544c1414c8SBarry Smith 15535cb6cd3SPierre Jolivet /* allocate space for reformatted column_inode structure */ 1569566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 157*4d12350bSJunchao Zhang for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]); 1584c1414c8SBarry Smith 1594c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 160*4d12350bSJunchao Zhang nsz = ns_col[i1 + 1] - ns_col[i1]; 1612205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 1624c1414c8SBarry Smith } 1634c1414c8SBarry Smith /* 
allocate space for row pointers */ 1649566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 1654c1414c8SBarry Smith *iia = ia; 1669566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_row + 1, &work)); 1674c1414c8SBarry Smith 1684c1414c8SBarry Smith /* determine the number of columns in each row */ 1694c1414c8SBarry Smith ia[0] = oshift; 170*4d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) { 171*4d12350bSJunchao Zhang row = ns_row[i1]; 1724c1414c8SBarry Smith j = aj + ai[row] + ishift; 17383fed2edSSatish Balay nz = ai[row + 1] - ai[row]; 17483fed2edSSatish Balay if (!nz) continue; /* empty row */ 1754c1414c8SBarry Smith col = *j++ + ishift; 1764c1414c8SBarry Smith i2 = tvc[col]; 1776aad120cSJose E. Roman while (nz-- > 0) { /* off-diagonal elements */ 1784c1414c8SBarry Smith ia[i1 + 1]++; 1794c1414c8SBarry Smith i2++; /* Start col of next node */ 180a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 1814c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 1824c1414c8SBarry Smith } 1834c1414c8SBarry Smith } 1844c1414c8SBarry Smith 1854c1414c8SBarry Smith /* shift ia[i] to point to next row */ 1864c1414c8SBarry Smith for (i1 = 1; i1 < nslim_row + 1; i1++) { 1874c1414c8SBarry Smith row = ia[i1 - 1]; 1884c1414c8SBarry Smith ia[i1] += row; 1894c1414c8SBarry Smith work[i1 - 1] = row - oshift; 1904c1414c8SBarry Smith } 1914c1414c8SBarry Smith 1924c1414c8SBarry Smith /* allocate space for column pointers */ 1934c1414c8SBarry Smith nz = ia[nslim_row] + (!ishift); 1949566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja)); 1954c1414c8SBarry Smith *jja = ja; 1964c1414c8SBarry Smith 1974c1414c8SBarry Smith /* loop over matrix putting into ja */ 198*4d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) { 199*4d12350bSJunchao Zhang row = ns_row[i1]; 2004c1414c8SBarry Smith j = aj + ai[row] + ishift; 20183fed2edSSatish Balay nz = ai[row + 1] - ai[row]; 20283fed2edSSatish Balay if (!nz) continue; /* empty row */ 
2034c1414c8SBarry Smith col = *j++ + ishift; 2044c1414c8SBarry Smith i2 = tvc[col]; 2054c1414c8SBarry Smith while (nz-- > 0) { 2064c1414c8SBarry Smith ja[work[i1]++] = i2 + oshift; 2074c1414c8SBarry Smith ++i2; 208a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 2094c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 2104c1414c8SBarry Smith } 2114c1414c8SBarry Smith } 2129566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col)); 2139566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 2149566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc)); 2153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2164c1414c8SBarry Smith } 2174c1414c8SBarry Smith 218d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 219d71ae5a4SJacob Faibussowitsch { 2204c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2214c1414c8SBarry Smith 2224c1414c8SBarry Smith PetscFunctionBegin; 22350ba90b4SBarry Smith if (n) *n = a->inode.node_count; 2243ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 2258f7157efSSatish Balay if (!blockcompressed) { 2269566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 2278f7157efSSatish Balay } else if (symmetric) { 2289566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 2294c1414c8SBarry Smith } else { 2309566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 2314c1414c8SBarry Smith } 2323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2334c1414c8SBarry Smith } 2344c1414c8SBarry Smith 235d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt 
*n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 236d71ae5a4SJacob Faibussowitsch { 2374c1414c8SBarry Smith PetscFunctionBegin; 2383ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 2398f7157efSSatish Balay 2408f7157efSSatish Balay if (!blockcompressed) { 2419566063dSJacob Faibussowitsch PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 2428f7157efSSatish Balay } else { 2439566063dSJacob Faibussowitsch PetscCall(PetscFree(*ia)); 2449566063dSJacob Faibussowitsch PetscCall(PetscFree(*ja)); 2458f7157efSSatish Balay } 2463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2474c1414c8SBarry Smith } 2484c1414c8SBarry Smith 249d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 250d71ae5a4SJacob Faibussowitsch { 2514c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2524c1414c8SBarry Smith PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 253*4d12350bSJunchao Zhang PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j; 2544c1414c8SBarry Smith 2554c1414c8SBarry Smith PetscFunctionBegin; 256*4d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2574c1414c8SBarry Smith nslim_row = a->inode.node_count; 258d0f46423SBarry Smith n = A->cmap->n; 2594c1414c8SBarry Smith 2604c1414c8SBarry Smith /* Create The column_inode for this matrix */ 2619566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 2624c1414c8SBarry Smith 26335cb6cd3SPierre Jolivet /* allocate space for reformatted column_inode structure */ 2649566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 265*4d12350bSJunchao Zhang for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 
+ 1] - ns_col[i1]); 2664c1414c8SBarry Smith 2674c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 268*4d12350bSJunchao Zhang nsz = ns_col[i1 + 1] - ns_col[i1]; 2692205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 2704c1414c8SBarry Smith } 2714c1414c8SBarry Smith /* allocate space for column pointers */ 2729566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 2734c1414c8SBarry Smith *iia = ia; 2749566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_col + 1, &work)); 2754c1414c8SBarry Smith 2764c1414c8SBarry Smith /* determine the number of columns in each row */ 2774c1414c8SBarry Smith ia[0] = oshift; 278*4d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) { 279*4d12350bSJunchao Zhang row = ns_row[i1]; 2804c1414c8SBarry Smith j = aj + ai[row] + ishift; 2814c1414c8SBarry Smith col = *j++ + ishift; 2824c1414c8SBarry Smith i2 = tvc[col]; 2834c1414c8SBarry Smith nz = ai[row + 1] - ai[row]; 2846aad120cSJose E. Roman while (nz-- > 0) { /* off-diagonal elements */ 2854c1414c8SBarry Smith /* ia[i1+1]++; */ 2864c1414c8SBarry Smith ia[i2 + 1]++; 2874c1414c8SBarry Smith i2++; 288a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 2894c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 2904c1414c8SBarry Smith } 2914c1414c8SBarry Smith } 2924c1414c8SBarry Smith 2934c1414c8SBarry Smith /* shift ia[i] to point to next col */ 2944c1414c8SBarry Smith for (i1 = 1; i1 < nslim_col + 1; i1++) { 2954c1414c8SBarry Smith col = ia[i1 - 1]; 2964c1414c8SBarry Smith ia[i1] += col; 2974c1414c8SBarry Smith work[i1 - 1] = col - oshift; 2984c1414c8SBarry Smith } 2994c1414c8SBarry Smith 3004c1414c8SBarry Smith /* allocate space for column pointers */ 3014c1414c8SBarry Smith nz = ia[nslim_col] + (!ishift); 3029566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja)); 3034c1414c8SBarry Smith *jja = ja; 3044c1414c8SBarry Smith 3054c1414c8SBarry Smith /* loop over matrix putting into ja */ 306*4d12350bSJunchao 
Zhang for (i1 = 0; i1 < nslim_row; i1++) { 307*4d12350bSJunchao Zhang row = ns_row[i1]; 3084c1414c8SBarry Smith j = aj + ai[row] + ishift; 3094c1414c8SBarry Smith col = *j++ + ishift; 3104c1414c8SBarry Smith i2 = tvc[col]; 3114c1414c8SBarry Smith nz = ai[row + 1] - ai[row]; 3124c1414c8SBarry Smith while (nz-- > 0) { 3134c1414c8SBarry Smith /* ja[work[i1]++] = i2 + oshift; */ 3144c1414c8SBarry Smith ja[work[i2]++] = i1 + oshift; 3154c1414c8SBarry Smith i2++; 316a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 3174c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 3184c1414c8SBarry Smith } 3194c1414c8SBarry Smith } 3209566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col)); 3219566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 3229566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc)); 3233ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3244c1414c8SBarry Smith } 3254c1414c8SBarry Smith 326d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 327d71ae5a4SJacob Faibussowitsch { 3284c1414c8SBarry Smith PetscFunctionBegin; 3299566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, n, NULL)); 3303ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 3314c1414c8SBarry Smith 3328f7157efSSatish Balay if (!blockcompressed) { 3339566063dSJacob Faibussowitsch PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 3348f7157efSSatish Balay } else if (symmetric) { 335a5b23f4aSJose E. 
Roman /* Since the indices are symmetric it doesn't matter */ 3369566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 3374c1414c8SBarry Smith } else { 3389566063dSJacob Faibussowitsch PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 3394c1414c8SBarry Smith } 3403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3414c1414c8SBarry Smith } 3424c1414c8SBarry Smith 343d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 344d71ae5a4SJacob Faibussowitsch { 3454c1414c8SBarry Smith PetscFunctionBegin; 3463ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 3478f7157efSSatish Balay if (!blockcompressed) { 3489566063dSJacob Faibussowitsch PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 3498f7157efSSatish Balay } else { 3509566063dSJacob Faibussowitsch PetscCall(PetscFree(*ia)); 3519566063dSJacob Faibussowitsch PetscCall(PetscFree(*ja)); 3528f7157efSSatish Balay } 3533ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3544c1414c8SBarry Smith } 3554c1414c8SBarry Smith 356d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 357d71ae5a4SJacob Faibussowitsch { 3584c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3594c1414c8SBarry Smith PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 360d9fead3dSBarry Smith PetscScalar *y; 361dd6ea824SBarry Smith const PetscScalar *x; 362dd6ea824SBarry Smith const MatScalar *v1, *v2, *v3, *v4, *v5; 3638758e1faSBarry Smith PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0; 3648758e1faSBarry Smith const PetscInt *idx, *ns, *ii; 3654c1414c8SBarry Smith 3664c1414c8SBarry Smith #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 3674c1414c8SBarry Smith #pragma 
disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 3684c1414c8SBarry Smith #endif 3694c1414c8SBarry Smith 3704c1414c8SBarry Smith PetscFunctionBegin; 371*4d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 3724c1414c8SBarry Smith node_max = a->inode.node_count; 373*4d12350bSJunchao Zhang ns = a->inode.size_csr; /* Node Size array */ 3749566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3759566063dSJacob Faibussowitsch PetscCall(VecGetArray(yy, &y)); 3764c1414c8SBarry Smith idx = a->j; 3774c1414c8SBarry Smith v1 = a->a; 3784c1414c8SBarry Smith ii = a->i; 3794c1414c8SBarry Smith 3804c1414c8SBarry Smith for (i = 0, row = 0; i < node_max; ++i) { 381*4d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i]; 3824c1414c8SBarry Smith n = ii[1] - ii[0]; 38398c9bda7SSatish Balay nonzerorow += (n > 0) * nsz; 3844c1414c8SBarry Smith ii += nsz; 38550d8bf02SJed Brown PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 38650d8bf02SJed Brown PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 3874c1414c8SBarry Smith sz = n; /* No of non zeros in this row */ 3884c1414c8SBarry Smith /* Switch on the size of Node */ 3894c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */ 3904c1414c8SBarry Smith case 1: 39175567043SBarry Smith sum1 = 0.; 3924c1414c8SBarry Smith 3934c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 3944c1414c8SBarry Smith i1 = idx[0]; /* The instructions are ordered to */ 3954c1414c8SBarry Smith i2 = idx[1]; /* make the compiler's job easy */ 3964c1414c8SBarry Smith idx += 2; 3974c1414c8SBarry Smith tmp0 = x[i1]; 3984c1414c8SBarry Smith tmp1 = x[i2]; 3999371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4009371c9d4SSatish Balay v1 += 2; 4014c1414c8SBarry Smith } 4024c1414c8SBarry Smith 4034c1414c8SBarry Smith if (n == 
sz - 1) { /* Take care of the last nonzero */ 4044c1414c8SBarry Smith tmp0 = x[*idx++]; 4054c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4064c1414c8SBarry Smith } 4074c1414c8SBarry Smith y[row++] = sum1; 4084c1414c8SBarry Smith break; 4094c1414c8SBarry Smith case 2: 41075567043SBarry Smith sum1 = 0.; 41175567043SBarry Smith sum2 = 0.; 4124c1414c8SBarry Smith v2 = v1 + n; 4134c1414c8SBarry Smith 4144c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 4154c1414c8SBarry Smith i1 = idx[0]; 4164c1414c8SBarry Smith i2 = idx[1]; 4174c1414c8SBarry Smith idx += 2; 4184c1414c8SBarry Smith tmp0 = x[i1]; 4194c1414c8SBarry Smith tmp1 = x[i2]; 4209371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4219371c9d4SSatish Balay v1 += 2; 4229371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 4239371c9d4SSatish Balay v2 += 2; 4244c1414c8SBarry Smith } 4254c1414c8SBarry Smith if (n == sz - 1) { 4264c1414c8SBarry Smith tmp0 = x[*idx++]; 4274c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4284c1414c8SBarry Smith sum2 += *v2++ * tmp0; 4294c1414c8SBarry Smith } 4304c1414c8SBarry Smith y[row++] = sum1; 4314c1414c8SBarry Smith y[row++] = sum2; 4324c1414c8SBarry Smith v1 = v2; /* Since the next block to be processed starts there*/ 4334c1414c8SBarry Smith idx += sz; 4344c1414c8SBarry Smith break; 4354c1414c8SBarry Smith case 3: 43675567043SBarry Smith sum1 = 0.; 43775567043SBarry Smith sum2 = 0.; 43875567043SBarry Smith sum3 = 0.; 4394c1414c8SBarry Smith v2 = v1 + n; 4404c1414c8SBarry Smith v3 = v2 + n; 4414c1414c8SBarry Smith 4424c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 4434c1414c8SBarry Smith i1 = idx[0]; 4444c1414c8SBarry Smith i2 = idx[1]; 4454c1414c8SBarry Smith idx += 2; 4464c1414c8SBarry Smith tmp0 = x[i1]; 4474c1414c8SBarry Smith tmp1 = x[i2]; 4489371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4499371c9d4SSatish Balay v1 += 2; 4509371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 4519371c9d4SSatish Balay v2 += 2; 4529371c9d4SSatish Balay sum3 += 
v3[0] * tmp0 + v3[1] * tmp1; 4539371c9d4SSatish Balay v3 += 2; 4544c1414c8SBarry Smith } 4554c1414c8SBarry Smith if (n == sz - 1) { 4564c1414c8SBarry Smith tmp0 = x[*idx++]; 4574c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4584c1414c8SBarry Smith sum2 += *v2++ * tmp0; 4594c1414c8SBarry Smith sum3 += *v3++ * tmp0; 4604c1414c8SBarry Smith } 4614c1414c8SBarry Smith y[row++] = sum1; 4624c1414c8SBarry Smith y[row++] = sum2; 4634c1414c8SBarry Smith y[row++] = sum3; 4644c1414c8SBarry Smith v1 = v3; /* Since the next block to be processed starts there*/ 4654c1414c8SBarry Smith idx += 2 * sz; 4664c1414c8SBarry Smith break; 4674c1414c8SBarry Smith case 4: 46875567043SBarry Smith sum1 = 0.; 46975567043SBarry Smith sum2 = 0.; 47075567043SBarry Smith sum3 = 0.; 47175567043SBarry Smith sum4 = 0.; 4724c1414c8SBarry Smith v2 = v1 + n; 4734c1414c8SBarry Smith v3 = v2 + n; 4744c1414c8SBarry Smith v4 = v3 + n; 4754c1414c8SBarry Smith 4764c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 4774c1414c8SBarry Smith i1 = idx[0]; 4784c1414c8SBarry Smith i2 = idx[1]; 4794c1414c8SBarry Smith idx += 2; 4804c1414c8SBarry Smith tmp0 = x[i1]; 4814c1414c8SBarry Smith tmp1 = x[i2]; 4829371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4839371c9d4SSatish Balay v1 += 2; 4849371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 4859371c9d4SSatish Balay v2 += 2; 4869371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 4879371c9d4SSatish Balay v3 += 2; 4889371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1; 4899371c9d4SSatish Balay v4 += 2; 4904c1414c8SBarry Smith } 4914c1414c8SBarry Smith if (n == sz - 1) { 4924c1414c8SBarry Smith tmp0 = x[*idx++]; 4934c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4944c1414c8SBarry Smith sum2 += *v2++ * tmp0; 4954c1414c8SBarry Smith sum3 += *v3++ * tmp0; 4964c1414c8SBarry Smith sum4 += *v4++ * tmp0; 4974c1414c8SBarry Smith } 4984c1414c8SBarry Smith y[row++] = sum1; 4994c1414c8SBarry Smith y[row++] = sum2; 5004c1414c8SBarry Smith y[row++] = sum3; 
5014c1414c8SBarry Smith y[row++] = sum4; 5024c1414c8SBarry Smith v1 = v4; /* Since the next block to be processed starts there*/ 5034c1414c8SBarry Smith idx += 3 * sz; 5044c1414c8SBarry Smith break; 5054c1414c8SBarry Smith case 5: 50675567043SBarry Smith sum1 = 0.; 50775567043SBarry Smith sum2 = 0.; 50875567043SBarry Smith sum3 = 0.; 50975567043SBarry Smith sum4 = 0.; 51075567043SBarry Smith sum5 = 0.; 5114c1414c8SBarry Smith v2 = v1 + n; 5124c1414c8SBarry Smith v3 = v2 + n; 5134c1414c8SBarry Smith v4 = v3 + n; 5144c1414c8SBarry Smith v5 = v4 + n; 5154c1414c8SBarry Smith 5164c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 5174c1414c8SBarry Smith i1 = idx[0]; 5184c1414c8SBarry Smith i2 = idx[1]; 5194c1414c8SBarry Smith idx += 2; 5204c1414c8SBarry Smith tmp0 = x[i1]; 5214c1414c8SBarry Smith tmp1 = x[i2]; 5229371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 5239371c9d4SSatish Balay v1 += 2; 5249371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 5259371c9d4SSatish Balay v2 += 2; 5269371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 5279371c9d4SSatish Balay v3 += 2; 5289371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1; 5299371c9d4SSatish Balay v4 += 2; 5309371c9d4SSatish Balay sum5 += v5[0] * tmp0 + v5[1] * tmp1; 5319371c9d4SSatish Balay v5 += 2; 5324c1414c8SBarry Smith } 5334c1414c8SBarry Smith if (n == sz - 1) { 5344c1414c8SBarry Smith tmp0 = x[*idx++]; 5354c1414c8SBarry Smith sum1 += *v1++ * tmp0; 5364c1414c8SBarry Smith sum2 += *v2++ * tmp0; 5374c1414c8SBarry Smith sum3 += *v3++ * tmp0; 5384c1414c8SBarry Smith sum4 += *v4++ * tmp0; 5394c1414c8SBarry Smith sum5 += *v5++ * tmp0; 5404c1414c8SBarry Smith } 5414c1414c8SBarry Smith y[row++] = sum1; 5424c1414c8SBarry Smith y[row++] = sum2; 5434c1414c8SBarry Smith y[row++] = sum3; 5444c1414c8SBarry Smith y[row++] = sum4; 5454c1414c8SBarry Smith y[row++] = sum5; 5464c1414c8SBarry Smith v1 = v5; /* Since the next block to be processed starts there */ 5474c1414c8SBarry Smith idx += 4 * sz; 
5484c1414c8SBarry Smith break; 549d71ae5a4SJacob Faibussowitsch default: 5500c335700SBarry Smith SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz); 5514c1414c8SBarry Smith } 5524c1414c8SBarry Smith } 5539566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 5549566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(yy, &y)); 5559566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 5563ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5574c1414c8SBarry Smith } 5582ef1f0ffSBarry Smith 5594108e4d5SBarry Smith /* Almost same code as the MatMult_SeqAIJ_Inode() */ 560d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 561d71ae5a4SJacob Faibussowitsch { 5624c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 5634c1414c8SBarry Smith PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 5648758e1faSBarry Smith const MatScalar *v1, *v2, *v3, *v4, *v5; 5658758e1faSBarry Smith const PetscScalar *x; 5668758e1faSBarry Smith PetscScalar *y, *z, *zt; 5678758e1faSBarry Smith PetscInt i1, i2, n, i, row, node_max, nsz, sz; 5688758e1faSBarry Smith const PetscInt *idx, *ns, *ii; 5694c1414c8SBarry Smith 5704c1414c8SBarry Smith PetscFunctionBegin; 571*4d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 5724c1414c8SBarry Smith node_max = a->inode.node_count; 573*4d12350bSJunchao Zhang ns = a->inode.size_csr; /* Node Size array */ 5742205254eSKarl Rupp 5759566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 5769566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 5774c1414c8SBarry Smith zt = z; 5784c1414c8SBarry Smith 5794c1414c8SBarry Smith idx = a->j; 5804c1414c8SBarry Smith v1 = a->a; 5814c1414c8SBarry Smith ii = a->i; 5824c1414c8SBarry Smith 583*4d12350bSJunchao Zhang for (i = 0; i < node_max; ++i) { 
584*4d12350bSJunchao Zhang row = ns[i]; 585*4d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i]; 5864c1414c8SBarry Smith n = ii[1] - ii[0]; 5874c1414c8SBarry Smith ii += nsz; 5884c1414c8SBarry Smith sz = n; /* No of non zeros in this row */ 5894c1414c8SBarry Smith /* Switch on the size of Node */ 5904c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */ 5914c1414c8SBarry Smith case 1: 5924c1414c8SBarry Smith sum1 = *zt++; 5934c1414c8SBarry Smith 5944c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 5954c1414c8SBarry Smith i1 = idx[0]; /* The instructions are ordered to */ 5964c1414c8SBarry Smith i2 = idx[1]; /* make the compiler's job easy */ 5974c1414c8SBarry Smith idx += 2; 5984c1414c8SBarry Smith tmp0 = x[i1]; 5994c1414c8SBarry Smith tmp1 = x[i2]; 6009371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 6019371c9d4SSatish Balay v1 += 2; 6024c1414c8SBarry Smith } 6034c1414c8SBarry Smith 6044c1414c8SBarry Smith if (n == sz - 1) { /* Take care of the last nonzero */ 6054c1414c8SBarry Smith tmp0 = x[*idx++]; 6064c1414c8SBarry Smith sum1 += *v1++ * tmp0; 6074c1414c8SBarry Smith } 6084c1414c8SBarry Smith y[row++] = sum1; 6094c1414c8SBarry Smith break; 6104c1414c8SBarry Smith case 2: 6114c1414c8SBarry Smith sum1 = *zt++; 6124c1414c8SBarry Smith sum2 = *zt++; 6134c1414c8SBarry Smith v2 = v1 + n; 6144c1414c8SBarry Smith 6154c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 6164c1414c8SBarry Smith i1 = idx[0]; 6174c1414c8SBarry Smith i2 = idx[1]; 6184c1414c8SBarry Smith idx += 2; 6194c1414c8SBarry Smith tmp0 = x[i1]; 6204c1414c8SBarry Smith tmp1 = x[i2]; 6219371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 6229371c9d4SSatish Balay v1 += 2; 6239371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 6249371c9d4SSatish Balay v2 += 2; 6254c1414c8SBarry Smith } 6264c1414c8SBarry Smith if (n == sz - 1) { 6274c1414c8SBarry Smith tmp0 = x[*idx++]; 6284c1414c8SBarry Smith sum1 += *v1++ * tmp0; 6294c1414c8SBarry Smith sum2 += *v2++ * tmp0; 
6304c1414c8SBarry Smith } 6314c1414c8SBarry Smith y[row++] = sum1; 6324c1414c8SBarry Smith y[row++] = sum2; 6334c1414c8SBarry Smith v1 = v2; /* Since the next block to be processed starts there*/ 6344c1414c8SBarry Smith idx += sz; 6354c1414c8SBarry Smith break; 6364c1414c8SBarry Smith case 3: 6374c1414c8SBarry Smith sum1 = *zt++; 6384c1414c8SBarry Smith sum2 = *zt++; 6394c1414c8SBarry Smith sum3 = *zt++; 6404c1414c8SBarry Smith v2 = v1 + n; 6414c1414c8SBarry Smith v3 = v2 + n; 6424c1414c8SBarry Smith 6434c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 6444c1414c8SBarry Smith i1 = idx[0]; 6454c1414c8SBarry Smith i2 = idx[1]; 6464c1414c8SBarry Smith idx += 2; 6474c1414c8SBarry Smith tmp0 = x[i1]; 6484c1414c8SBarry Smith tmp1 = x[i2]; 6499371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 6509371c9d4SSatish Balay v1 += 2; 6519371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 6529371c9d4SSatish Balay v2 += 2; 6539371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 6549371c9d4SSatish Balay v3 += 2; 6554c1414c8SBarry Smith } 6564c1414c8SBarry Smith if (n == sz - 1) { 6574c1414c8SBarry Smith tmp0 = x[*idx++]; 6584c1414c8SBarry Smith sum1 += *v1++ * tmp0; 6594c1414c8SBarry Smith sum2 += *v2++ * tmp0; 6604c1414c8SBarry Smith sum3 += *v3++ * tmp0; 6614c1414c8SBarry Smith } 6624c1414c8SBarry Smith y[row++] = sum1; 6634c1414c8SBarry Smith y[row++] = sum2; 6644c1414c8SBarry Smith y[row++] = sum3; 6654c1414c8SBarry Smith v1 = v3; /* Since the next block to be processed starts there*/ 6664c1414c8SBarry Smith idx += 2 * sz; 6674c1414c8SBarry Smith break; 6684c1414c8SBarry Smith case 4: 6694c1414c8SBarry Smith sum1 = *zt++; 6704c1414c8SBarry Smith sum2 = *zt++; 6714c1414c8SBarry Smith sum3 = *zt++; 6724c1414c8SBarry Smith sum4 = *zt++; 6734c1414c8SBarry Smith v2 = v1 + n; 6744c1414c8SBarry Smith v3 = v2 + n; 6754c1414c8SBarry Smith v4 = v3 + n; 6764c1414c8SBarry Smith 6774c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 6784c1414c8SBarry Smith i1 = 
idx[0]; 6794c1414c8SBarry Smith i2 = idx[1]; 6804c1414c8SBarry Smith idx += 2; 6814c1414c8SBarry Smith tmp0 = x[i1]; 6824c1414c8SBarry Smith tmp1 = x[i2]; 6839371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 6849371c9d4SSatish Balay v1 += 2; 6859371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 6869371c9d4SSatish Balay v2 += 2; 6879371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 6889371c9d4SSatish Balay v3 += 2; 6899371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1; 6909371c9d4SSatish Balay v4 += 2; 6914c1414c8SBarry Smith } 6924c1414c8SBarry Smith if (n == sz - 1) { 6934c1414c8SBarry Smith tmp0 = x[*idx++]; 6944c1414c8SBarry Smith sum1 += *v1++ * tmp0; 6954c1414c8SBarry Smith sum2 += *v2++ * tmp0; 6964c1414c8SBarry Smith sum3 += *v3++ * tmp0; 6974c1414c8SBarry Smith sum4 += *v4++ * tmp0; 6984c1414c8SBarry Smith } 6994c1414c8SBarry Smith y[row++] = sum1; 7004c1414c8SBarry Smith y[row++] = sum2; 7014c1414c8SBarry Smith y[row++] = sum3; 7024c1414c8SBarry Smith y[row++] = sum4; 7034c1414c8SBarry Smith v1 = v4; /* Since the next block to be processed starts there*/ 7044c1414c8SBarry Smith idx += 3 * sz; 7054c1414c8SBarry Smith break; 7064c1414c8SBarry Smith case 5: 7074c1414c8SBarry Smith sum1 = *zt++; 7084c1414c8SBarry Smith sum2 = *zt++; 7094c1414c8SBarry Smith sum3 = *zt++; 7104c1414c8SBarry Smith sum4 = *zt++; 7114c1414c8SBarry Smith sum5 = *zt++; 7124c1414c8SBarry Smith v2 = v1 + n; 7134c1414c8SBarry Smith v3 = v2 + n; 7144c1414c8SBarry Smith v4 = v3 + n; 7154c1414c8SBarry Smith v5 = v4 + n; 7164c1414c8SBarry Smith 7174c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 7184c1414c8SBarry Smith i1 = idx[0]; 7194c1414c8SBarry Smith i2 = idx[1]; 7204c1414c8SBarry Smith idx += 2; 7214c1414c8SBarry Smith tmp0 = x[i1]; 7224c1414c8SBarry Smith tmp1 = x[i2]; 7239371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 7249371c9d4SSatish Balay v1 += 2; 7259371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 7269371c9d4SSatish 
Balay v2 += 2; 7279371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 7289371c9d4SSatish Balay v3 += 2; 7299371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1; 7309371c9d4SSatish Balay v4 += 2; 7319371c9d4SSatish Balay sum5 += v5[0] * tmp0 + v5[1] * tmp1; 7329371c9d4SSatish Balay v5 += 2; 7334c1414c8SBarry Smith } 7344c1414c8SBarry Smith if (n == sz - 1) { 7354c1414c8SBarry Smith tmp0 = x[*idx++]; 7364c1414c8SBarry Smith sum1 += *v1++ * tmp0; 7374c1414c8SBarry Smith sum2 += *v2++ * tmp0; 7384c1414c8SBarry Smith sum3 += *v3++ * tmp0; 7394c1414c8SBarry Smith sum4 += *v4++ * tmp0; 7404c1414c8SBarry Smith sum5 += *v5++ * tmp0; 7414c1414c8SBarry Smith } 7424c1414c8SBarry Smith y[row++] = sum1; 7434c1414c8SBarry Smith y[row++] = sum2; 7444c1414c8SBarry Smith y[row++] = sum3; 7454c1414c8SBarry Smith y[row++] = sum4; 7464c1414c8SBarry Smith y[row++] = sum5; 7474c1414c8SBarry Smith v1 = v5; /* Since the next block to be processed starts there */ 7484c1414c8SBarry Smith idx += 4 * sz; 7494c1414c8SBarry Smith break; 750d71ae5a4SJacob Faibussowitsch default: 751d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 7524c1414c8SBarry Smith } 7534c1414c8SBarry Smith } 7549566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x)); 7559566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 7569566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 7573ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 7584c1414c8SBarry Smith } 7594c1414c8SBarry Smith 760ff6a9541SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 761d71ae5a4SJacob Faibussowitsch { 7624c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 7634c1414c8SBarry Smith IS iscol = a->col, isrow = a->row; 7645d0c19d7SBarry Smith const PetscInt *r, *c, *rout, *cout; 7658758e1faSBarry Smith PetscInt i, j, n = A->rmap->n, nz; 7668758e1faSBarry Smith PetscInt 
node_max, *ns, row, nsz, aii, i0, i1; 7678758e1faSBarry Smith const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 768d9fead3dSBarry Smith PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 769d9fead3dSBarry Smith PetscScalar sum1, sum2, sum3, sum4, sum5; 770dd6ea824SBarry Smith const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 771dd6ea824SBarry Smith const PetscScalar *b; 7724c1414c8SBarry Smith 7734c1414c8SBarry Smith PetscFunctionBegin; 774*4d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 7754c1414c8SBarry Smith node_max = a->inode.node_count; 776*4d12350bSJunchao Zhang ns = a->inode.size_csr; /* Node Size array */ 7774c1414c8SBarry Smith 7789566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 7799566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(xx, &x)); 7804c1414c8SBarry Smith tmp = a->solve_work; 7814c1414c8SBarry Smith 7829371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 7839371c9d4SSatish Balay r = rout; 7849371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 7859371c9d4SSatish Balay c = cout + (n - 1); 7864c1414c8SBarry Smith 7874c1414c8SBarry Smith /* forward solve the lower triangular */ 7884c1414c8SBarry Smith tmps = tmp; 7894c1414c8SBarry Smith aa = a_a; 7904c1414c8SBarry Smith aj = a_j; 7914c1414c8SBarry Smith ad = a->diag; 7924c1414c8SBarry Smith 7934c1414c8SBarry Smith for (i = 0, row = 0; i < node_max; ++i) { 794*4d12350bSJunchao Zhang row = ns[i]; 795*4d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i]; 7964c1414c8SBarry Smith aii = ai[row]; 7974c1414c8SBarry Smith v1 = aa + aii; 7984c1414c8SBarry Smith vi = aj + aii; 7994c1414c8SBarry Smith nz = ad[row] - aii; 80026549573SJed Brown if (i < node_max - 1) { 80126549573SJed Brown /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 80291c35059SPierre Jolivet * but our indexing to determine its size could. 
*/ 80350d8bf02SJed Brown PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 80426549573SJed Brown /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 805*4d12350bSJunchao Zhang PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 80626549573SJed Brown /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 80726549573SJed Brown } 8084c1414c8SBarry Smith 8094c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */ 8104c1414c8SBarry Smith case 1: 8114c1414c8SBarry Smith sum1 = b[*r++]; 8124c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 8134c1414c8SBarry Smith i0 = vi[0]; 8144c1414c8SBarry Smith i1 = vi[1]; 8154c1414c8SBarry Smith vi += 2; 8164c1414c8SBarry Smith tmp0 = tmps[i0]; 8174c1414c8SBarry Smith tmp1 = tmps[i1]; 8189371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 8199371c9d4SSatish Balay v1 += 2; 8204c1414c8SBarry Smith } 8214c1414c8SBarry Smith if (j == nz - 1) { 8224c1414c8SBarry Smith tmp0 = tmps[*vi++]; 8234c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 8244c1414c8SBarry Smith } 8254c1414c8SBarry Smith tmp[row++] = sum1; 8264c1414c8SBarry Smith break; 8274c1414c8SBarry Smith case 2: 8284c1414c8SBarry Smith sum1 = b[*r++]; 8294c1414c8SBarry Smith sum2 = b[*r++]; 8304c1414c8SBarry Smith v2 = aa + ai[row + 1]; 8314c1414c8SBarry Smith 8324c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 8334c1414c8SBarry Smith i0 = vi[0]; 8344c1414c8SBarry Smith i1 = vi[1]; 8354c1414c8SBarry Smith vi += 2; 8364c1414c8SBarry Smith tmp0 = tmps[i0]; 8374c1414c8SBarry Smith tmp1 = tmps[i1]; 8389371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 8399371c9d4SSatish Balay v1 += 2; 8409371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 8419371c9d4SSatish Balay v2 += 2; 8424c1414c8SBarry Smith } 8434c1414c8SBarry Smith if (j == nz - 
1) { 8444c1414c8SBarry Smith tmp0 = tmps[*vi++]; 8454c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 8464c1414c8SBarry Smith sum2 -= *v2++ * tmp0; 8474c1414c8SBarry Smith } 8484c1414c8SBarry Smith sum2 -= *v2++ * sum1; 8494c1414c8SBarry Smith tmp[row++] = sum1; 8504c1414c8SBarry Smith tmp[row++] = sum2; 8514c1414c8SBarry Smith break; 8524c1414c8SBarry Smith case 3: 8534c1414c8SBarry Smith sum1 = b[*r++]; 8544c1414c8SBarry Smith sum2 = b[*r++]; 8554c1414c8SBarry Smith sum3 = b[*r++]; 8564c1414c8SBarry Smith v2 = aa + ai[row + 1]; 8574c1414c8SBarry Smith v3 = aa + ai[row + 2]; 8584c1414c8SBarry Smith 8594c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 8604c1414c8SBarry Smith i0 = vi[0]; 8614c1414c8SBarry Smith i1 = vi[1]; 8624c1414c8SBarry Smith vi += 2; 8634c1414c8SBarry Smith tmp0 = tmps[i0]; 8644c1414c8SBarry Smith tmp1 = tmps[i1]; 8659371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 8669371c9d4SSatish Balay v1 += 2; 8679371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 8689371c9d4SSatish Balay v2 += 2; 8699371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 8709371c9d4SSatish Balay v3 += 2; 8714c1414c8SBarry Smith } 8724c1414c8SBarry Smith if (j == nz - 1) { 8734c1414c8SBarry Smith tmp0 = tmps[*vi++]; 8744c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 8754c1414c8SBarry Smith sum2 -= *v2++ * tmp0; 8764c1414c8SBarry Smith sum3 -= *v3++ * tmp0; 8774c1414c8SBarry Smith } 8784c1414c8SBarry Smith sum2 -= *v2++ * sum1; 8794c1414c8SBarry Smith sum3 -= *v3++ * sum1; 8804c1414c8SBarry Smith sum3 -= *v3++ * sum2; 8812205254eSKarl Rupp 8824c1414c8SBarry Smith tmp[row++] = sum1; 8834c1414c8SBarry Smith tmp[row++] = sum2; 8844c1414c8SBarry Smith tmp[row++] = sum3; 8854c1414c8SBarry Smith break; 8864c1414c8SBarry Smith 8874c1414c8SBarry Smith case 4: 8884c1414c8SBarry Smith sum1 = b[*r++]; 8894c1414c8SBarry Smith sum2 = b[*r++]; 8904c1414c8SBarry Smith sum3 = b[*r++]; 8914c1414c8SBarry Smith sum4 = b[*r++]; 8924c1414c8SBarry Smith v2 = aa + ai[row + 1]; 
8934c1414c8SBarry Smith v3 = aa + ai[row + 2]; 8944c1414c8SBarry Smith v4 = aa + ai[row + 3]; 8954c1414c8SBarry Smith 8964c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 8974c1414c8SBarry Smith i0 = vi[0]; 8984c1414c8SBarry Smith i1 = vi[1]; 8994c1414c8SBarry Smith vi += 2; 9004c1414c8SBarry Smith tmp0 = tmps[i0]; 9014c1414c8SBarry Smith tmp1 = tmps[i1]; 9029371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 9039371c9d4SSatish Balay v1 += 2; 9049371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 9059371c9d4SSatish Balay v2 += 2; 9069371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 9079371c9d4SSatish Balay v3 += 2; 9089371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 9099371c9d4SSatish Balay v4 += 2; 9104c1414c8SBarry Smith } 9114c1414c8SBarry Smith if (j == nz - 1) { 9124c1414c8SBarry Smith tmp0 = tmps[*vi++]; 9134c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 9144c1414c8SBarry Smith sum2 -= *v2++ * tmp0; 9154c1414c8SBarry Smith sum3 -= *v3++ * tmp0; 9164c1414c8SBarry Smith sum4 -= *v4++ * tmp0; 9174c1414c8SBarry Smith } 9184c1414c8SBarry Smith sum2 -= *v2++ * sum1; 9194c1414c8SBarry Smith sum3 -= *v3++ * sum1; 9204c1414c8SBarry Smith sum4 -= *v4++ * sum1; 9214c1414c8SBarry Smith sum3 -= *v3++ * sum2; 9224c1414c8SBarry Smith sum4 -= *v4++ * sum2; 9234c1414c8SBarry Smith sum4 -= *v4++ * sum3; 9244c1414c8SBarry Smith 9254c1414c8SBarry Smith tmp[row++] = sum1; 9264c1414c8SBarry Smith tmp[row++] = sum2; 9274c1414c8SBarry Smith tmp[row++] = sum3; 9284c1414c8SBarry Smith tmp[row++] = sum4; 9294c1414c8SBarry Smith break; 9304c1414c8SBarry Smith case 5: 9314c1414c8SBarry Smith sum1 = b[*r++]; 9324c1414c8SBarry Smith sum2 = b[*r++]; 9334c1414c8SBarry Smith sum3 = b[*r++]; 9344c1414c8SBarry Smith sum4 = b[*r++]; 9354c1414c8SBarry Smith sum5 = b[*r++]; 9364c1414c8SBarry Smith v2 = aa + ai[row + 1]; 9374c1414c8SBarry Smith v3 = aa + ai[row + 2]; 9384c1414c8SBarry Smith v4 = aa + ai[row + 3]; 9394c1414c8SBarry Smith v5 = aa + ai[row + 4]; 
9404c1414c8SBarry Smith 9414c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) { 9424c1414c8SBarry Smith i0 = vi[0]; 9434c1414c8SBarry Smith i1 = vi[1]; 9444c1414c8SBarry Smith vi += 2; 9454c1414c8SBarry Smith tmp0 = tmps[i0]; 9464c1414c8SBarry Smith tmp1 = tmps[i1]; 9479371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 9489371c9d4SSatish Balay v1 += 2; 9499371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 9509371c9d4SSatish Balay v2 += 2; 9519371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 9529371c9d4SSatish Balay v3 += 2; 9539371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 9549371c9d4SSatish Balay v4 += 2; 9559371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 9569371c9d4SSatish Balay v5 += 2; 9574c1414c8SBarry Smith } 9584c1414c8SBarry Smith if (j == nz - 1) { 9594c1414c8SBarry Smith tmp0 = tmps[*vi++]; 9604c1414c8SBarry Smith sum1 -= *v1++ * tmp0; 9614c1414c8SBarry Smith sum2 -= *v2++ * tmp0; 9624c1414c8SBarry Smith sum3 -= *v3++ * tmp0; 9634c1414c8SBarry Smith sum4 -= *v4++ * tmp0; 9644c1414c8SBarry Smith sum5 -= *v5++ * tmp0; 9654c1414c8SBarry Smith } 9664c1414c8SBarry Smith 9674c1414c8SBarry Smith sum2 -= *v2++ * sum1; 9684c1414c8SBarry Smith sum3 -= *v3++ * sum1; 9694c1414c8SBarry Smith sum4 -= *v4++ * sum1; 9704c1414c8SBarry Smith sum5 -= *v5++ * sum1; 9714c1414c8SBarry Smith sum3 -= *v3++ * sum2; 9724c1414c8SBarry Smith sum4 -= *v4++ * sum2; 9734c1414c8SBarry Smith sum5 -= *v5++ * sum2; 9744c1414c8SBarry Smith sum4 -= *v4++ * sum3; 9754c1414c8SBarry Smith sum5 -= *v5++ * sum3; 9764c1414c8SBarry Smith sum5 -= *v5++ * sum4; 9774c1414c8SBarry Smith 9784c1414c8SBarry Smith tmp[row++] = sum1; 9794c1414c8SBarry Smith tmp[row++] = sum2; 9804c1414c8SBarry Smith tmp[row++] = sum3; 9814c1414c8SBarry Smith tmp[row++] = sum4; 9824c1414c8SBarry Smith tmp[row++] = sum5; 9834c1414c8SBarry Smith break; 984d71ae5a4SJacob Faibussowitsch default: 985d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size 
not yet supported "); 9864c1414c8SBarry Smith } 9874c1414c8SBarry Smith } 9884c1414c8SBarry Smith /* backward solve the upper triangular */ 989*4d12350bSJunchao Zhang for (i = node_max - 1; i >= 0; i--) { 990*4d12350bSJunchao Zhang row = ns[i + 1]; 991*4d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i]; 9924c1414c8SBarry Smith aii = ai[row + 1] - 1; 9934c1414c8SBarry Smith v1 = aa + aii; 9944c1414c8SBarry Smith vi = aj + aii; 9954c1414c8SBarry Smith nz = aii - ad[row]; 9964c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */ 9974c1414c8SBarry Smith case 1: 9984c1414c8SBarry Smith sum1 = tmp[row]; 9994c1414c8SBarry Smith 10004c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 10014c1414c8SBarry Smith vi -= 2; 10024c1414c8SBarry Smith i0 = vi[2]; 10034c1414c8SBarry Smith i1 = vi[1]; 10044c1414c8SBarry Smith tmp0 = tmps[i0]; 10054c1414c8SBarry Smith tmp1 = tmps[i1]; 10064c1414c8SBarry Smith v1 -= 2; 10074c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 10084c1414c8SBarry Smith } 10094c1414c8SBarry Smith if (j == 1) { 10104c1414c8SBarry Smith tmp0 = tmps[*vi--]; 10114c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 10124c1414c8SBarry Smith } 10139371c9d4SSatish Balay x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 10149371c9d4SSatish Balay row--; 10154c1414c8SBarry Smith break; 10164c1414c8SBarry Smith case 2: 10174c1414c8SBarry Smith sum1 = tmp[row]; 10184c1414c8SBarry Smith sum2 = tmp[row - 1]; 10194c1414c8SBarry Smith v2 = aa + ai[row] - 1; 10204c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 10214c1414c8SBarry Smith vi -= 2; 10224c1414c8SBarry Smith i0 = vi[2]; 10234c1414c8SBarry Smith i1 = vi[1]; 10244c1414c8SBarry Smith tmp0 = tmps[i0]; 10254c1414c8SBarry Smith tmp1 = tmps[i1]; 10264c1414c8SBarry Smith v1 -= 2; 10274c1414c8SBarry Smith v2 -= 2; 10284c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 10294c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 10304c1414c8SBarry Smith } 10314c1414c8SBarry Smith if (j == 1) { 10324c1414c8SBarry 
Smith tmp0 = tmps[*vi--]; 10334c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 10344c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 10354c1414c8SBarry Smith } 10364c1414c8SBarry Smith 10379371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 10389371c9d4SSatish Balay row--; 10394c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 10409371c9d4SSatish Balay x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 10419371c9d4SSatish Balay row--; 10424c1414c8SBarry Smith break; 10434c1414c8SBarry Smith case 3: 10444c1414c8SBarry Smith sum1 = tmp[row]; 10454c1414c8SBarry Smith sum2 = tmp[row - 1]; 10464c1414c8SBarry Smith sum3 = tmp[row - 2]; 10474c1414c8SBarry Smith v2 = aa + ai[row] - 1; 10484c1414c8SBarry Smith v3 = aa + ai[row - 1] - 1; 10494c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 10504c1414c8SBarry Smith vi -= 2; 10514c1414c8SBarry Smith i0 = vi[2]; 10524c1414c8SBarry Smith i1 = vi[1]; 10534c1414c8SBarry Smith tmp0 = tmps[i0]; 10544c1414c8SBarry Smith tmp1 = tmps[i1]; 10554c1414c8SBarry Smith v1 -= 2; 10564c1414c8SBarry Smith v2 -= 2; 10574c1414c8SBarry Smith v3 -= 2; 10584c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 10594c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 10604c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 10614c1414c8SBarry Smith } 10624c1414c8SBarry Smith if (j == 1) { 10634c1414c8SBarry Smith tmp0 = tmps[*vi--]; 10644c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 10654c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 10664c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 10674c1414c8SBarry Smith } 10689371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 10699371c9d4SSatish Balay row--; 10704c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 10714c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 10729371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 10739371c9d4SSatish Balay row--; 10744c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 10759371c9d4SSatish Balay x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 10769371c9d4SSatish Balay row--; 
10774c1414c8SBarry Smith 10784c1414c8SBarry Smith break; 10794c1414c8SBarry Smith case 4: 10804c1414c8SBarry Smith sum1 = tmp[row]; 10814c1414c8SBarry Smith sum2 = tmp[row - 1]; 10824c1414c8SBarry Smith sum3 = tmp[row - 2]; 10834c1414c8SBarry Smith sum4 = tmp[row - 3]; 10844c1414c8SBarry Smith v2 = aa + ai[row] - 1; 10854c1414c8SBarry Smith v3 = aa + ai[row - 1] - 1; 10864c1414c8SBarry Smith v4 = aa + ai[row - 2] - 1; 10874c1414c8SBarry Smith 10884c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 10894c1414c8SBarry Smith vi -= 2; 10904c1414c8SBarry Smith i0 = vi[2]; 10914c1414c8SBarry Smith i1 = vi[1]; 10924c1414c8SBarry Smith tmp0 = tmps[i0]; 10934c1414c8SBarry Smith tmp1 = tmps[i1]; 10944c1414c8SBarry Smith v1 -= 2; 10954c1414c8SBarry Smith v2 -= 2; 10964c1414c8SBarry Smith v3 -= 2; 10974c1414c8SBarry Smith v4 -= 2; 10984c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 10994c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 11004c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 11014c1414c8SBarry Smith sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 11024c1414c8SBarry Smith } 11034c1414c8SBarry Smith if (j == 1) { 11044c1414c8SBarry Smith tmp0 = tmps[*vi--]; 11054c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 11064c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 11074c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11084c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11094c1414c8SBarry Smith } 11104c1414c8SBarry Smith 11119371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 11129371c9d4SSatish Balay row--; 11134c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 11144c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11154c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11169371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 11179371c9d4SSatish Balay row--; 11184c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11194c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11209371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 11219371c9d4SSatish Balay row--; 
11224c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11239371c9d4SSatish Balay x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 11249371c9d4SSatish Balay row--; 11254c1414c8SBarry Smith break; 11264c1414c8SBarry Smith case 5: 11274c1414c8SBarry Smith sum1 = tmp[row]; 11284c1414c8SBarry Smith sum2 = tmp[row - 1]; 11294c1414c8SBarry Smith sum3 = tmp[row - 2]; 11304c1414c8SBarry Smith sum4 = tmp[row - 3]; 11314c1414c8SBarry Smith sum5 = tmp[row - 4]; 11324c1414c8SBarry Smith v2 = aa + ai[row] - 1; 11334c1414c8SBarry Smith v3 = aa + ai[row - 1] - 1; 11344c1414c8SBarry Smith v4 = aa + ai[row - 2] - 1; 11354c1414c8SBarry Smith v5 = aa + ai[row - 3] - 1; 11364c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) { 11374c1414c8SBarry Smith vi -= 2; 11384c1414c8SBarry Smith i0 = vi[2]; 11394c1414c8SBarry Smith i1 = vi[1]; 11404c1414c8SBarry Smith tmp0 = tmps[i0]; 11414c1414c8SBarry Smith tmp1 = tmps[i1]; 11424c1414c8SBarry Smith v1 -= 2; 11434c1414c8SBarry Smith v2 -= 2; 11444c1414c8SBarry Smith v3 -= 2; 11454c1414c8SBarry Smith v4 -= 2; 11464c1414c8SBarry Smith v5 -= 2; 11474c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 11484c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 11494c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 11504c1414c8SBarry Smith sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 11514c1414c8SBarry Smith sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 11524c1414c8SBarry Smith } 11534c1414c8SBarry Smith if (j == 1) { 11544c1414c8SBarry Smith tmp0 = tmps[*vi--]; 11554c1414c8SBarry Smith sum1 -= *v1-- * tmp0; 11564c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 11574c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11584c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11594c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11604c1414c8SBarry Smith } 11614c1414c8SBarry Smith 11629371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 11639371c9d4SSatish Balay row--; 11644c1414c8SBarry Smith sum2 -= *v2-- * tmp0; 11654c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11664c1414c8SBarry Smith sum4 
-= *v4-- * tmp0; 11674c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11689371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 11699371c9d4SSatish Balay row--; 11704c1414c8SBarry Smith sum3 -= *v3-- * tmp0; 11714c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11724c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11739371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 11749371c9d4SSatish Balay row--; 11754c1414c8SBarry Smith sum4 -= *v4-- * tmp0; 11764c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11779371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 11789371c9d4SSatish Balay row--; 11794c1414c8SBarry Smith sum5 -= *v5-- * tmp0; 11809371c9d4SSatish Balay x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 11819371c9d4SSatish Balay row--; 11824c1414c8SBarry Smith break; 1183d71ae5a4SJacob Faibussowitsch default: 1184d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 11854c1414c8SBarry Smith } 11864c1414c8SBarry Smith } 11879566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 11889566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 11899566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 11909566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(xx, &x)); 11919566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 11923ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 11934c1414c8SBarry Smith } 11944c1414c8SBarry Smith 1195d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1196d71ae5a4SJacob Faibussowitsch { 119728f1b45aSHong Zhang Mat C = B; 119828f1b45aSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 119928f1b45aSHong Zhang IS isrow = b->row, isicol = b->icol; 120028f1b45aSHong Zhang const PetscInt *r, *ic, *ics; 120128f1b45aSHong Zhang const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi 
= b->i, *bj = b->j, *bdiag = b->diag; 120228f1b45aSHong Zhang PetscInt i, j, k, nz, nzL, row, *pj; 120328f1b45aSHong Zhang const PetscInt *ajtmp, *bjtmp; 12049877982aSShri Abhyankar MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 12059877982aSShri Abhyankar const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 120628f1b45aSHong Zhang FactorShiftCtx sctx; 12074f81c4b7SBarry Smith const PetscInt *ddiag; 120828f1b45aSHong Zhang PetscReal rs; 120928f1b45aSHong Zhang MatScalar d; 12104f81c4b7SBarry Smith PetscInt inod, nodesz, node_max, col; 12114f81c4b7SBarry Smith const PetscInt *ns; 121207b50cabSHong Zhang PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 12130e95ead3SHong Zhang 121428f1b45aSHong Zhang PetscFunctionBegin; 121528f1b45aSHong Zhang /* MatPivotSetUp(): initialize shift context sctx */ 12169566063dSJacob Faibussowitsch PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 121728f1b45aSHong Zhang 1218f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 121928f1b45aSHong Zhang ddiag = a->diag; 122028f1b45aSHong Zhang sctx.shift_top = info->zeropivot; 122128f1b45aSHong Zhang for (i = 0; i < n; i++) { 122228f1b45aSHong Zhang /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 122328f1b45aSHong Zhang d = (aa)[ddiag[i]]; 122428f1b45aSHong Zhang rs = -PetscAbsScalar(d) - PetscRealPart(d); 122528f1b45aSHong Zhang v = aa + ai[i]; 122628f1b45aSHong Zhang nz = ai[i + 1] - ai[i]; 12272205254eSKarl Rupp for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 122828f1b45aSHong Zhang if (rs > sctx.shift_top) sctx.shift_top = rs; 122928f1b45aSHong Zhang } 123028f1b45aSHong Zhang sctx.shift_top *= 1.1; 123128f1b45aSHong Zhang sctx.nshift_max = 5; 123228f1b45aSHong Zhang sctx.shift_lo = 0.; 123328f1b45aSHong Zhang sctx.shift_hi = 1.; 123428f1b45aSHong Zhang } 123528f1b45aSHong Zhang 12369566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, 
&r)); 12379566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 123868785679SHong Zhang 12399566063dSJacob Faibussowitsch PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 124028f1b45aSHong Zhang ics = ic; 124128f1b45aSHong Zhang 124228f1b45aSHong Zhang node_max = a->inode.node_count; 1243*4d12350bSJunchao Zhang ns = a->inode.size_csr; 124428b400f6SJacob Faibussowitsch PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 124528f1b45aSHong Zhang 12469877982aSShri Abhyankar /* If max inode size > 4, split it into two inodes.*/ 124768785679SHong Zhang /* also map the inode sizes according to the ordering */ 12489566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 124968785679SHong Zhang for (i = 0, j = 0; i < node_max; ++i, ++j) { 1250*4d12350bSJunchao Zhang nodesz = ns[i + 1] - ns[i]; 1251*4d12350bSJunchao Zhang if (nodesz > 4) { 1252048b5e81SShri Abhyankar tmp_vec1[j] = 4; 125368785679SHong Zhang ++j; 1254*4d12350bSJunchao Zhang tmp_vec1[j] = nodesz - tmp_vec1[j - 1]; 125568785679SHong Zhang } else { 1256*4d12350bSJunchao Zhang tmp_vec1[j] = nodesz; 125768785679SHong Zhang } 125868785679SHong Zhang } 125968785679SHong Zhang /* Use the correct node_max */ 126068785679SHong Zhang node_max = j; 126168785679SHong Zhang 126268785679SHong Zhang /* Now reorder the inode info based on mat re-ordering info */ 126368785679SHong Zhang /* First create a row -> inode_size_array_index map */ 12649566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &nsmap)); 12659566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 1266*4d12350bSJunchao Zhang tmp_vec2[0] = 0; 126768785679SHong Zhang for (i = 0, row = 0; i < node_max; i++) { 126868785679SHong Zhang nodesz = tmp_vec1[i]; 1269ad540459SPierre Jolivet for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 127068785679SHong Zhang } 127168785679SHong Zhang /* Using nsmap, create a reordered ns structure */ 127268785679SHong 
Zhang for (i = 0, j = 0; i < node_max; i++) { 127368785679SHong Zhang nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1274*4d12350bSJunchao Zhang tmp_vec2[i + 1] = tmp_vec2[i] + nodesz; 127568785679SHong Zhang j += nodesz; 127668785679SHong Zhang } 12779566063dSJacob Faibussowitsch PetscCall(PetscFree(nsmap)); 12789566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp_vec1)); 1279b89f182dSHong Zhang 128068785679SHong Zhang /* Now use the correct ns */ 128168785679SHong Zhang ns = tmp_vec2; 128268785679SHong Zhang 128328f1b45aSHong Zhang do { 128407b50cabSHong Zhang sctx.newshift = PETSC_FALSE; 128528f1b45aSHong Zhang /* Now loop over each block-row, and do the factorization */ 128628f1b45aSHong Zhang for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 1287*4d12350bSJunchao Zhang nodesz = ns[inod + 1] - ns[inod]; 128828f1b45aSHong Zhang 128928f1b45aSHong Zhang switch (nodesz) { 129028f1b45aSHong Zhang case 1: 1291b89f182dSHong Zhang /* zero rtmp1 */ 129228f1b45aSHong Zhang /* L part */ 129328f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 129428f1b45aSHong Zhang bjtmp = bj + bi[i]; 1295b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 129628f1b45aSHong Zhang 129728f1b45aSHong Zhang /* U part */ 129828f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1]; 129928f1b45aSHong Zhang bjtmp = bj + bdiag[i + 1] + 1; 1300b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 130128f1b45aSHong Zhang 130228f1b45aSHong Zhang /* load in initial (unfactored row) */ 130328f1b45aSHong Zhang nz = ai[r[i] + 1] - ai[r[i]]; 130428f1b45aSHong Zhang ajtmp = aj + ai[r[i]]; 130528f1b45aSHong Zhang v = aa + ai[r[i]]; 13062205254eSKarl Rupp for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 13072205254eSKarl Rupp 130828f1b45aSHong Zhang /* ZeropivotApply() */ 1309b89f182dSHong Zhang rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 131028f1b45aSHong Zhang 131128f1b45aSHong Zhang /* 
elimination */ 131228f1b45aSHong Zhang bjtmp = bj + bi[i]; 131328f1b45aSHong Zhang row = *bjtmp++; 131428f1b45aSHong Zhang nzL = bi[i + 1] - bi[i]; 131528f1b45aSHong Zhang for (k = 0; k < nzL; k++) { 1316b89f182dSHong Zhang pc = rtmp1 + row; 131728f1b45aSHong Zhang if (*pc != 0.0) { 131828f1b45aSHong Zhang pv = b->a + bdiag[row]; 1319b89f182dSHong Zhang mul1 = *pc * (*pv); 1320b89f182dSHong Zhang *pc = mul1; 132128f1b45aSHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 132228f1b45aSHong Zhang pv = b->a + bdiag[row + 1] + 1; 132328f1b45aSHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1324b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 13259566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 132628f1b45aSHong Zhang } 132728f1b45aSHong Zhang row = *bjtmp++; 132828f1b45aSHong Zhang } 132928f1b45aSHong Zhang 133028f1b45aSHong Zhang /* finished row so stick it into b->a */ 133128f1b45aSHong Zhang rs = 0.0; 133228f1b45aSHong Zhang /* L part */ 133328f1b45aSHong Zhang pv = b->a + bi[i]; 133428f1b45aSHong Zhang pj = b->j + bi[i]; 133528f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 133628f1b45aSHong Zhang for (j = 0; j < nz; j++) { 13379371c9d4SSatish Balay pv[j] = rtmp1[pj[j]]; 13389371c9d4SSatish Balay rs += PetscAbsScalar(pv[j]); 133928f1b45aSHong Zhang } 134028f1b45aSHong Zhang 134128f1b45aSHong Zhang /* U part */ 134228f1b45aSHong Zhang pv = b->a + bdiag[i + 1] + 1; 134328f1b45aSHong Zhang pj = b->j + bdiag[i + 1] + 1; 134428f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; 134528f1b45aSHong Zhang for (j = 0; j < nz; j++) { 13469371c9d4SSatish Balay pv[j] = rtmp1[pj[j]]; 13479371c9d4SSatish Balay rs += PetscAbsScalar(pv[j]); 134828f1b45aSHong Zhang } 134928f1b45aSHong Zhang 1350b89f182dSHong Zhang /* Check zero pivot */ 135128f1b45aSHong Zhang sctx.rs = rs; 1352b89f182dSHong Zhang sctx.pv = rtmp1[i]; 13539566063dSJacob Faibussowitsch 
PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 135407b50cabSHong Zhang if (sctx.newshift) break; 135528f1b45aSHong Zhang 1356a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */ 135728f1b45aSHong Zhang pv = b->a + bdiag[i]; 1358b89f182dSHong Zhang *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 135928f1b45aSHong Zhang break; 136028f1b45aSHong Zhang 136128f1b45aSHong Zhang case 2: 1362b89f182dSHong Zhang /* zero rtmp1 and rtmp2 */ 136328f1b45aSHong Zhang /* L part */ 136428f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 136528f1b45aSHong Zhang bjtmp = bj + bi[i]; 136628f1b45aSHong Zhang for (j = 0; j < nz; j++) { 136768785679SHong Zhang col = bjtmp[j]; 13689371c9d4SSatish Balay rtmp1[col] = 0.0; 13699371c9d4SSatish Balay rtmp2[col] = 0.0; 137028f1b45aSHong Zhang } 137128f1b45aSHong Zhang 137228f1b45aSHong Zhang /* U part */ 137328f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1]; 137428f1b45aSHong Zhang bjtmp = bj + bdiag[i + 1] + 1; 137528f1b45aSHong Zhang for (j = 0; j < nz; j++) { 137668785679SHong Zhang col = bjtmp[j]; 13779371c9d4SSatish Balay rtmp1[col] = 0.0; 13789371c9d4SSatish Balay rtmp2[col] = 0.0; 137928f1b45aSHong Zhang } 138028f1b45aSHong Zhang 138128f1b45aSHong Zhang /* load in initial (unfactored row) */ 138228f1b45aSHong Zhang nz = ai[r[i] + 1] - ai[r[i]]; 138328f1b45aSHong Zhang ajtmp = aj + ai[r[i]]; 13849371c9d4SSatish Balay v1 = aa + ai[r[i]]; 13859371c9d4SSatish Balay v2 = aa + ai[r[i] + 1]; 138628f1b45aSHong Zhang for (j = 0; j < nz; j++) { 138768785679SHong Zhang col = ics[ajtmp[j]]; 13889371c9d4SSatish Balay rtmp1[col] = v1[j]; 13899371c9d4SSatish Balay rtmp2[col] = v2[j]; 139028f1b45aSHong Zhang } 139128f1b45aSHong Zhang /* ZeropivotApply(): shift the diagonal of the matrix */ 13929371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount; 13939371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount; 139428f1b45aSHong Zhang 139528f1b45aSHong Zhang /* elimination */ 
139628f1b45aSHong Zhang bjtmp = bj + bi[i]; 139728f1b45aSHong Zhang row = *bjtmp++; /* pivot row */ 139828f1b45aSHong Zhang nzL = bi[i + 1] - bi[i]; 139928f1b45aSHong Zhang for (k = 0; k < nzL; k++) { 1400b89f182dSHong Zhang pc1 = rtmp1 + row; 1401b89f182dSHong Zhang pc2 = rtmp2 + row; 140228f1b45aSHong Zhang if (*pc1 != 0.0 || *pc2 != 0.0) { 140328f1b45aSHong Zhang pv = b->a + bdiag[row]; 14049371c9d4SSatish Balay mul1 = *pc1 * (*pv); 14059371c9d4SSatish Balay mul2 = *pc2 * (*pv); 14069371c9d4SSatish Balay *pc1 = mul1; 14079371c9d4SSatish Balay *pc2 = mul2; 140828f1b45aSHong Zhang 140928f1b45aSHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 141028f1b45aSHong Zhang pv = b->a + bdiag[row + 1] + 1; 141128f1b45aSHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 141228f1b45aSHong Zhang for (j = 0; j < nz; j++) { 141368785679SHong Zhang col = pj[j]; 1414b89f182dSHong Zhang rtmp1[col] -= mul1 * pv[j]; 1415b89f182dSHong Zhang rtmp2[col] -= mul2 * pv[j]; 141628f1b45aSHong Zhang } 14179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz)); 141828f1b45aSHong Zhang } 141928f1b45aSHong Zhang row = *bjtmp++; 142028f1b45aSHong Zhang } 142128f1b45aSHong Zhang 1422b89f182dSHong Zhang /* finished row i; check zero pivot, then stick row i into b->a */ 142328f1b45aSHong Zhang rs = 0.0; 142428f1b45aSHong Zhang /* L part */ 1425b89f182dSHong Zhang pc1 = b->a + bi[i]; 142628f1b45aSHong Zhang pj = b->j + bi[i]; 142728f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 142828f1b45aSHong Zhang for (j = 0; j < nz; j++) { 142968785679SHong Zhang col = pj[j]; 14309371c9d4SSatish Balay pc1[j] = rtmp1[col]; 14319371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 143228f1b45aSHong Zhang } 143328f1b45aSHong Zhang /* U part */ 1434b89f182dSHong Zhang pc1 = b->a + bdiag[i + 1] + 1; 143528f1b45aSHong Zhang pj = b->j + bdiag[i + 1] + 1; 14360e7a5c2bSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 
143728f1b45aSHong Zhang for (j = 0; j < nz; j++) { 143868785679SHong Zhang col = pj[j]; 14399371c9d4SSatish Balay pc1[j] = rtmp1[col]; 14409371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 144128f1b45aSHong Zhang } 144228f1b45aSHong Zhang 144328f1b45aSHong Zhang sctx.rs = rs; 1444b89f182dSHong Zhang sctx.pv = rtmp1[i]; 14459566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 144607b50cabSHong Zhang if (sctx.newshift) break; 1447b89f182dSHong Zhang pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1448b89f182dSHong Zhang *pc1 = 1.0 / sctx.pv; 1449b89f182dSHong Zhang 1450b89f182dSHong Zhang /* Now take care of diagonal 2x2 block. */ 1451b89f182dSHong Zhang pc2 = rtmp2 + i; 1452b89f182dSHong Zhang if (*pc2 != 0.0) { 1453b89f182dSHong Zhang mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1454b89f182dSHong Zhang *pc2 = mul1; /* insert L entry */ 1455b89f182dSHong Zhang pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1456b89f182dSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1457b89f182dSHong Zhang for (j = 0; j < nz; j++) { 14589371c9d4SSatish Balay col = pj[j]; 14599371c9d4SSatish Balay rtmp2[col] -= mul1 * rtmp1[col]; 146028f1b45aSHong Zhang } 14619566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1462b89f182dSHong Zhang } 1463b89f182dSHong Zhang 1464b89f182dSHong Zhang /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1465b89f182dSHong Zhang rs = 0.0; 1466b89f182dSHong Zhang /* L part */ 1467b89f182dSHong Zhang pc2 = b->a + bi[i + 1]; 1468b89f182dSHong Zhang pj = b->j + bi[i + 1]; 1469b89f182dSHong Zhang nz = bi[i + 2] - bi[i + 1]; 1470b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1471b89f182dSHong Zhang col = pj[j]; 14729371c9d4SSatish Balay pc2[j] = rtmp2[col]; 14739371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1474b89f182dSHong Zhang } 1475b89f182dSHong Zhang /* U part */ 1476b89f182dSHong Zhang pc2 = b->a + bdiag[i + 2] + 1; 
14770e7a5c2bSHong Zhang pj = b->j + bdiag[i + 2] + 1; 14780e7a5c2bSHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1479b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1480b89f182dSHong Zhang col = pj[j]; 14819371c9d4SSatish Balay pc2[j] = rtmp2[col]; 14829371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1483b89f182dSHong Zhang } 1484b89f182dSHong Zhang 148528f1b45aSHong Zhang sctx.rs = rs; 1486b89f182dSHong Zhang sctx.pv = rtmp2[i + 1]; 14879566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 148807b50cabSHong Zhang if (sctx.newshift) break; 148928f1b45aSHong Zhang pc2 = b->a + bdiag[i + 1]; 1490b89f182dSHong Zhang *pc2 = 1.0 / sctx.pv; 149128f1b45aSHong Zhang break; 1492b89f182dSHong Zhang 149368785679SHong Zhang case 3: 149468785679SHong Zhang /* zero rtmp */ 149568785679SHong Zhang /* L part */ 149668785679SHong Zhang nz = bi[i + 1] - bi[i]; 149768785679SHong Zhang bjtmp = bj + bi[i]; 149868785679SHong Zhang for (j = 0; j < nz; j++) { 149968785679SHong Zhang col = bjtmp[j]; 15009371c9d4SSatish Balay rtmp1[col] = 0.0; 15019371c9d4SSatish Balay rtmp2[col] = 0.0; 15029371c9d4SSatish Balay rtmp3[col] = 0.0; 150368785679SHong Zhang } 150468785679SHong Zhang 150568785679SHong Zhang /* U part */ 150668785679SHong Zhang nz = bdiag[i] - bdiag[i + 1]; 150768785679SHong Zhang bjtmp = bj + bdiag[i + 1] + 1; 150868785679SHong Zhang for (j = 0; j < nz; j++) { 150968785679SHong Zhang col = bjtmp[j]; 15109371c9d4SSatish Balay rtmp1[col] = 0.0; 15119371c9d4SSatish Balay rtmp2[col] = 0.0; 15129371c9d4SSatish Balay rtmp3[col] = 0.0; 151368785679SHong Zhang } 151468785679SHong Zhang 151568785679SHong Zhang /* load in initial (unfactored row) */ 151668785679SHong Zhang nz = ai[r[i] + 1] - ai[r[i]]; 151768785679SHong Zhang ajtmp = aj + ai[r[i]]; 15189371c9d4SSatish Balay v1 = aa + ai[r[i]]; 15199371c9d4SSatish Balay v2 = aa + ai[r[i] + 1]; 15209371c9d4SSatish Balay v3 = aa + ai[r[i] + 2]; 152168785679SHong Zhang for (j = 0; j < 
nz; j++) { 152268785679SHong Zhang col = ics[ajtmp[j]]; 15239371c9d4SSatish Balay rtmp1[col] = v1[j]; 15249371c9d4SSatish Balay rtmp2[col] = v2[j]; 15259371c9d4SSatish Balay rtmp3[col] = v3[j]; 152668785679SHong Zhang } 152768785679SHong Zhang /* ZeropivotApply(): shift the diagonal of the matrix */ 15289371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount; 15299371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount; 15309371c9d4SSatish Balay rtmp3[i + 2] += sctx.shift_amount; 153168785679SHong Zhang 153268785679SHong Zhang /* elimination */ 153368785679SHong Zhang bjtmp = bj + bi[i]; 153468785679SHong Zhang row = *bjtmp++; /* pivot row */ 153568785679SHong Zhang nzL = bi[i + 1] - bi[i]; 153668785679SHong Zhang for (k = 0; k < nzL; k++) { 1537b89f182dSHong Zhang pc1 = rtmp1 + row; 1538b89f182dSHong Zhang pc2 = rtmp2 + row; 1539b89f182dSHong Zhang pc3 = rtmp3 + row; 154068785679SHong Zhang if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 154168785679SHong Zhang pv = b->a + bdiag[row]; 15429371c9d4SSatish Balay mul1 = *pc1 * (*pv); 15439371c9d4SSatish Balay mul2 = *pc2 * (*pv); 15449371c9d4SSatish Balay mul3 = *pc3 * (*pv); 15459371c9d4SSatish Balay *pc1 = mul1; 15469371c9d4SSatish Balay *pc2 = mul2; 15479371c9d4SSatish Balay *pc3 = mul3; 154868785679SHong Zhang 154968785679SHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 155068785679SHong Zhang pv = b->a + bdiag[row + 1] + 1; 155168785679SHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 155268785679SHong Zhang for (j = 0; j < nz; j++) { 155368785679SHong Zhang col = pj[j]; 1554b89f182dSHong Zhang rtmp1[col] -= mul1 * pv[j]; 1555b89f182dSHong Zhang rtmp2[col] -= mul2 * pv[j]; 1556b89f182dSHong Zhang rtmp3[col] -= mul3 * pv[j]; 155768785679SHong Zhang } 15589566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(3 + 6.0 * nz)); 155968785679SHong Zhang } 156068785679SHong Zhang row = *bjtmp++; 156168785679SHong Zhang } 156268785679SHong Zhang 
1563b89f182dSHong Zhang /* finished row i; check zero pivot, then stick row i into b->a */ 1564b89f182dSHong Zhang rs = 0.0; 1565b89f182dSHong Zhang /* L part */ 1566b89f182dSHong Zhang pc1 = b->a + bi[i]; 1567b89f182dSHong Zhang pj = b->j + bi[i]; 1568b89f182dSHong Zhang nz = bi[i + 1] - bi[i]; 1569b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1570b89f182dSHong Zhang col = pj[j]; 15719371c9d4SSatish Balay pc1[j] = rtmp1[col]; 15729371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 1573b89f182dSHong Zhang } 1574b89f182dSHong Zhang /* U part */ 1575b89f182dSHong Zhang pc1 = b->a + bdiag[i + 1] + 1; 1576b89f182dSHong Zhang pj = b->j + bdiag[i + 1] + 1; 15770e7a5c2bSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1578b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1579b89f182dSHong Zhang col = pj[j]; 15809371c9d4SSatish Balay pc1[j] = rtmp1[col]; 15819371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 1582b89f182dSHong Zhang } 158368785679SHong Zhang 1584b89f182dSHong Zhang sctx.rs = rs; 1585b89f182dSHong Zhang sctx.pv = rtmp1[i]; 15869566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 158707b50cabSHong Zhang if (sctx.newshift) break; 1588b89f182dSHong Zhang pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1589b89f182dSHong Zhang *pc1 = 1.0 / sctx.pv; 1590b89f182dSHong Zhang 1591b89f182dSHong Zhang /* Now take care of 1st column of diagonal 3x3 block. 
*/ 1592b89f182dSHong Zhang pc2 = rtmp2 + i; 1593b89f182dSHong Zhang pc3 = rtmp3 + i; 1594b89f182dSHong Zhang if (*pc2 != 0.0 || *pc3 != 0.0) { 15959371c9d4SSatish Balay mul2 = (*pc2) * (*pc1); 15969371c9d4SSatish Balay *pc2 = mul2; 15979371c9d4SSatish Balay mul3 = (*pc3) * (*pc1); 15989371c9d4SSatish Balay *pc3 = mul3; 159968785679SHong Zhang pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 160068785679SHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 160168785679SHong Zhang for (j = 0; j < nz; j++) { 160268785679SHong Zhang col = pj[j]; 1603b89f182dSHong Zhang rtmp2[col] -= mul2 * rtmp1[col]; 1604b89f182dSHong Zhang rtmp3[col] -= mul3 * rtmp1[col]; 160568785679SHong Zhang } 16069566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz)); 160768785679SHong Zhang } 160868785679SHong Zhang 1609b89f182dSHong Zhang /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1610b89f182dSHong Zhang rs = 0.0; 1611b89f182dSHong Zhang /* L part */ 1612b89f182dSHong Zhang pc2 = b->a + bi[i + 1]; 1613b89f182dSHong Zhang pj = b->j + bi[i + 1]; 1614b89f182dSHong Zhang nz = bi[i + 2] - bi[i + 1]; 1615b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1616b89f182dSHong Zhang col = pj[j]; 16179371c9d4SSatish Balay pc2[j] = rtmp2[col]; 16189371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1619b89f182dSHong Zhang } 1620b89f182dSHong Zhang /* U part */ 1621b89f182dSHong Zhang pc2 = b->a + bdiag[i + 2] + 1; 16220e7a5c2bSHong Zhang pj = b->j + bdiag[i + 2] + 1; 16230e7a5c2bSHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1624b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1625b89f182dSHong Zhang col = pj[j]; 16269371c9d4SSatish Balay pc2[j] = rtmp2[col]; 16279371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1628b89f182dSHong Zhang } 1629b89f182dSHong Zhang 1630b89f182dSHong Zhang sctx.rs = rs; 1631b89f182dSHong Zhang sctx.pv = rtmp2[i + 1]; 16329566063dSJacob Faibussowitsch 
PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 163307b50cabSHong Zhang if (sctx.newshift) break; 1634b89f182dSHong Zhang pc2 = b->a + bdiag[i + 1]; 1635b89f182dSHong Zhang *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1636b89f182dSHong Zhang 1637b89f182dSHong Zhang /* Now take care of 2nd column of diagonal 3x3 block. */ 1638b89f182dSHong Zhang pc3 = rtmp3 + i + 1; 163968785679SHong Zhang if (*pc3 != 0.0) { 16409371c9d4SSatish Balay mul3 = (*pc3) * (*pc2); 16419371c9d4SSatish Balay *pc3 = mul3; 164268785679SHong Zhang pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 164368785679SHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 164468785679SHong Zhang for (j = 0; j < nz; j++) { 164568785679SHong Zhang col = pj[j]; 1646b89f182dSHong Zhang rtmp3[col] -= mul3 * rtmp2[col]; 164768785679SHong Zhang } 16489566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 164968785679SHong Zhang } 165068785679SHong Zhang 1651b89f182dSHong Zhang /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 165268785679SHong Zhang rs = 0.0; 165368785679SHong Zhang /* L part */ 1654b89f182dSHong Zhang pc3 = b->a + bi[i + 2]; 1655b89f182dSHong Zhang pj = b->j + bi[i + 2]; 1656b89f182dSHong Zhang nz = bi[i + 3] - bi[i + 2]; 165768785679SHong Zhang for (j = 0; j < nz; j++) { 165868785679SHong Zhang col = pj[j]; 16599371c9d4SSatish Balay pc3[j] = rtmp3[col]; 16609371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 166168785679SHong Zhang } 166268785679SHong Zhang /* U part */ 1663b89f182dSHong Zhang pc3 = b->a + bdiag[i + 3] + 1; 16640e7a5c2bSHong Zhang pj = b->j + bdiag[i + 3] + 1; 16650e7a5c2bSHong Zhang nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 166668785679SHong Zhang for (j = 0; j < nz; j++) { 166768785679SHong Zhang col = pj[j]; 16689371c9d4SSatish Balay pc3[j] = rtmp3[col]; 16699371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 167068785679SHong Zhang } 167168785679SHong Zhang 
167268785679SHong Zhang sctx.rs = rs; 1673b89f182dSHong Zhang sctx.pv = rtmp3[i + 2]; 16749566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 167507b50cabSHong Zhang if (sctx.newshift) break; 167668785679SHong Zhang pc3 = b->a + bdiag[i + 2]; 1677b89f182dSHong Zhang *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 167868785679SHong Zhang break; 16799877982aSShri Abhyankar case 4: 16809877982aSShri Abhyankar /* zero rtmp */ 16819877982aSShri Abhyankar /* L part */ 16829877982aSShri Abhyankar nz = bi[i + 1] - bi[i]; 16839877982aSShri Abhyankar bjtmp = bj + bi[i]; 16849877982aSShri Abhyankar for (j = 0; j < nz; j++) { 16859877982aSShri Abhyankar col = bjtmp[j]; 16869371c9d4SSatish Balay rtmp1[col] = 0.0; 16879371c9d4SSatish Balay rtmp2[col] = 0.0; 16889371c9d4SSatish Balay rtmp3[col] = 0.0; 16899371c9d4SSatish Balay rtmp4[col] = 0.0; 16909877982aSShri Abhyankar } 16919877982aSShri Abhyankar 16929877982aSShri Abhyankar /* U part */ 16939877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1]; 16949877982aSShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1; 16959877982aSShri Abhyankar for (j = 0; j < nz; j++) { 16969877982aSShri Abhyankar col = bjtmp[j]; 16979371c9d4SSatish Balay rtmp1[col] = 0.0; 16989371c9d4SSatish Balay rtmp2[col] = 0.0; 16999371c9d4SSatish Balay rtmp3[col] = 0.0; 17009371c9d4SSatish Balay rtmp4[col] = 0.0; 17019877982aSShri Abhyankar } 17029877982aSShri Abhyankar 17039877982aSShri Abhyankar /* load in initial (unfactored row) */ 17049877982aSShri Abhyankar nz = ai[r[i] + 1] - ai[r[i]]; 17059877982aSShri Abhyankar ajtmp = aj + ai[r[i]]; 17069371c9d4SSatish Balay v1 = aa + ai[r[i]]; 17079371c9d4SSatish Balay v2 = aa + ai[r[i] + 1]; 17089371c9d4SSatish Balay v3 = aa + ai[r[i] + 2]; 17099371c9d4SSatish Balay v4 = aa + ai[r[i] + 3]; 17109877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17119877982aSShri Abhyankar col = ics[ajtmp[j]]; 17129371c9d4SSatish Balay rtmp1[col] = v1[j]; 17139371c9d4SSatish Balay rtmp2[col] = v2[j]; 
17149371c9d4SSatish Balay rtmp3[col] = v3[j]; 17159371c9d4SSatish Balay rtmp4[col] = v4[j]; 17169877982aSShri Abhyankar } 17179877982aSShri Abhyankar /* ZeropivotApply(): shift the diagonal of the matrix */ 17189371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount; 17199371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount; 17209371c9d4SSatish Balay rtmp3[i + 2] += sctx.shift_amount; 17219371c9d4SSatish Balay rtmp4[i + 3] += sctx.shift_amount; 17229877982aSShri Abhyankar 17239877982aSShri Abhyankar /* elimination */ 17249877982aSShri Abhyankar bjtmp = bj + bi[i]; 17259877982aSShri Abhyankar row = *bjtmp++; /* pivot row */ 17269877982aSShri Abhyankar nzL = bi[i + 1] - bi[i]; 17279877982aSShri Abhyankar for (k = 0; k < nzL; k++) { 17289877982aSShri Abhyankar pc1 = rtmp1 + row; 17299877982aSShri Abhyankar pc2 = rtmp2 + row; 17309877982aSShri Abhyankar pc3 = rtmp3 + row; 17319877982aSShri Abhyankar pc4 = rtmp4 + row; 17329877982aSShri Abhyankar if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 17339877982aSShri Abhyankar pv = b->a + bdiag[row]; 17349371c9d4SSatish Balay mul1 = *pc1 * (*pv); 17359371c9d4SSatish Balay mul2 = *pc2 * (*pv); 17369371c9d4SSatish Balay mul3 = *pc3 * (*pv); 17379371c9d4SSatish Balay mul4 = *pc4 * (*pv); 17389371c9d4SSatish Balay *pc1 = mul1; 17399371c9d4SSatish Balay *pc2 = mul2; 17409371c9d4SSatish Balay *pc3 = mul3; 17419371c9d4SSatish Balay *pc4 = mul4; 17429877982aSShri Abhyankar 17439877982aSShri Abhyankar pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 17449877982aSShri Abhyankar pv = b->a + bdiag[row + 1] + 1; 17459877982aSShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 17469877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17479877982aSShri Abhyankar col = pj[j]; 17489877982aSShri Abhyankar rtmp1[col] -= mul1 * pv[j]; 17499877982aSShri Abhyankar rtmp2[col] -= mul2 * pv[j]; 17509877982aSShri Abhyankar rtmp3[col] -= mul3 * pv[j]; 17519877982aSShri Abhyankar 
rtmp4[col] -= mul4 * pv[j]; 17529877982aSShri Abhyankar } 17539566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4 + 8.0 * nz)); 17549877982aSShri Abhyankar } 17559877982aSShri Abhyankar row = *bjtmp++; 17569877982aSShri Abhyankar } 17579877982aSShri Abhyankar 17589877982aSShri Abhyankar /* finished row i; check zero pivot, then stick row i into b->a */ 17599877982aSShri Abhyankar rs = 0.0; 17609877982aSShri Abhyankar /* L part */ 17619877982aSShri Abhyankar pc1 = b->a + bi[i]; 17629877982aSShri Abhyankar pj = b->j + bi[i]; 17639877982aSShri Abhyankar nz = bi[i + 1] - bi[i]; 17649877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17659877982aSShri Abhyankar col = pj[j]; 17669371c9d4SSatish Balay pc1[j] = rtmp1[col]; 17679371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 17689877982aSShri Abhyankar } 17699877982aSShri Abhyankar /* U part */ 17709877982aSShri Abhyankar pc1 = b->a + bdiag[i + 1] + 1; 17719877982aSShri Abhyankar pj = b->j + bdiag[i + 1] + 1; 17729877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 17739877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17749877982aSShri Abhyankar col = pj[j]; 17759371c9d4SSatish Balay pc1[j] = rtmp1[col]; 17769371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 17779877982aSShri Abhyankar } 17789877982aSShri Abhyankar 17799877982aSShri Abhyankar sctx.rs = rs; 17809877982aSShri Abhyankar sctx.pv = rtmp1[i]; 17819566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 178207b50cabSHong Zhang if (sctx.newshift) break; 17839877982aSShri Abhyankar pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 17849877982aSShri Abhyankar *pc1 = 1.0 / sctx.pv; 17859877982aSShri Abhyankar 17869877982aSShri Abhyankar /* Now take care of 1st column of diagonal 4x4 block. 
*/ 17879877982aSShri Abhyankar pc2 = rtmp2 + i; 17889877982aSShri Abhyankar pc3 = rtmp3 + i; 17899877982aSShri Abhyankar pc4 = rtmp4 + i; 17909877982aSShri Abhyankar if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 17919371c9d4SSatish Balay mul2 = (*pc2) * (*pc1); 17929371c9d4SSatish Balay *pc2 = mul2; 17939371c9d4SSatish Balay mul3 = (*pc3) * (*pc1); 17949371c9d4SSatish Balay *pc3 = mul3; 17959371c9d4SSatish Balay mul4 = (*pc4) * (*pc1); 17969371c9d4SSatish Balay *pc4 = mul4; 17979877982aSShri Abhyankar pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 17989877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 17999877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18009877982aSShri Abhyankar col = pj[j]; 18019877982aSShri Abhyankar rtmp2[col] -= mul2 * rtmp1[col]; 18029877982aSShri Abhyankar rtmp3[col] -= mul3 * rtmp1[col]; 18039877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp1[col]; 18049877982aSShri Abhyankar } 18059566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(3 + 6.0 * nz)); 18069877982aSShri Abhyankar } 18079877982aSShri Abhyankar 18089877982aSShri Abhyankar /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 18099877982aSShri Abhyankar rs = 0.0; 18109877982aSShri Abhyankar /* L part */ 18119877982aSShri Abhyankar pc2 = b->a + bi[i + 1]; 18129877982aSShri Abhyankar pj = b->j + bi[i + 1]; 18139877982aSShri Abhyankar nz = bi[i + 2] - bi[i + 1]; 18149877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18159877982aSShri Abhyankar col = pj[j]; 18169371c9d4SSatish Balay pc2[j] = rtmp2[col]; 18179371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 18189877982aSShri Abhyankar } 18199877982aSShri Abhyankar /* U part */ 18209877982aSShri Abhyankar pc2 = b->a + bdiag[i + 2] + 1; 18219877982aSShri Abhyankar pj = b->j + bdiag[i + 2] + 1; 18229877982aSShri Abhyankar nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 18239877982aSShri Abhyankar for (j = 0; j < nz; j++) { 
18249877982aSShri Abhyankar col = pj[j]; 18259371c9d4SSatish Balay pc2[j] = rtmp2[col]; 18269371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 18279877982aSShri Abhyankar } 18289877982aSShri Abhyankar 18299877982aSShri Abhyankar sctx.rs = rs; 18309877982aSShri Abhyankar sctx.pv = rtmp2[i + 1]; 18319566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 183207b50cabSHong Zhang if (sctx.newshift) break; 18339877982aSShri Abhyankar pc2 = b->a + bdiag[i + 1]; 18349877982aSShri Abhyankar *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 18359877982aSShri Abhyankar 18369877982aSShri Abhyankar /* Now take care of 2nd column of diagonal 4x4 block. */ 18379877982aSShri Abhyankar pc3 = rtmp3 + i + 1; 18389877982aSShri Abhyankar pc4 = rtmp4 + i + 1; 18399877982aSShri Abhyankar if (*pc3 != 0.0 || *pc4 != 0.0) { 18409371c9d4SSatish Balay mul3 = (*pc3) * (*pc2); 18419371c9d4SSatish Balay *pc3 = mul3; 18429371c9d4SSatish Balay mul4 = (*pc4) * (*pc2); 18439371c9d4SSatish Balay *pc4 = mul4; 18449877982aSShri Abhyankar pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 18459877982aSShri Abhyankar nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 18469877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18479877982aSShri Abhyankar col = pj[j]; 18489877982aSShri Abhyankar rtmp3[col] -= mul3 * rtmp2[col]; 18499877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp2[col]; 18509877982aSShri Abhyankar } 18519566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * nz)); 18529877982aSShri Abhyankar } 18539877982aSShri Abhyankar 18549877982aSShri Abhyankar /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 18559877982aSShri Abhyankar rs = 0.0; 18569877982aSShri Abhyankar /* L part */ 18579877982aSShri Abhyankar pc3 = b->a + bi[i + 2]; 18589877982aSShri Abhyankar pj = b->j + bi[i + 2]; 18599877982aSShri Abhyankar nz = bi[i + 3] - bi[i + 2]; 18609877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18619877982aSShri 
Abhyankar col = pj[j]; 18629371c9d4SSatish Balay pc3[j] = rtmp3[col]; 18639371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 18649877982aSShri Abhyankar } 18659877982aSShri Abhyankar /* U part */ 18669877982aSShri Abhyankar pc3 = b->a + bdiag[i + 3] + 1; 18679877982aSShri Abhyankar pj = b->j + bdiag[i + 3] + 1; 18689877982aSShri Abhyankar nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 18699877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18709877982aSShri Abhyankar col = pj[j]; 18719371c9d4SSatish Balay pc3[j] = rtmp3[col]; 18729371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 18739877982aSShri Abhyankar } 18749877982aSShri Abhyankar 18759877982aSShri Abhyankar sctx.rs = rs; 18769877982aSShri Abhyankar sctx.pv = rtmp3[i + 2]; 18779566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 187807b50cabSHong Zhang if (sctx.newshift) break; 18799877982aSShri Abhyankar pc3 = b->a + bdiag[i + 2]; 18809877982aSShri Abhyankar *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 18819877982aSShri Abhyankar 18829877982aSShri Abhyankar /* Now take care of 3rd column of diagonal 4x4 block. 
*/ 18839877982aSShri Abhyankar pc4 = rtmp4 + i + 2; 18849877982aSShri Abhyankar if (*pc4 != 0.0) { 18859371c9d4SSatish Balay mul4 = (*pc4) * (*pc3); 18869371c9d4SSatish Balay *pc4 = mul4; 18879877982aSShri Abhyankar pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 18889877982aSShri Abhyankar nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 18899877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18909877982aSShri Abhyankar col = pj[j]; 18919877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp3[col]; 18929877982aSShri Abhyankar } 18939566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 18949877982aSShri Abhyankar } 18959877982aSShri Abhyankar 18969877982aSShri Abhyankar /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 18979877982aSShri Abhyankar rs = 0.0; 18989877982aSShri Abhyankar /* L part */ 18999877982aSShri Abhyankar pc4 = b->a + bi[i + 3]; 19009877982aSShri Abhyankar pj = b->j + bi[i + 3]; 19019877982aSShri Abhyankar nz = bi[i + 4] - bi[i + 3]; 19029877982aSShri Abhyankar for (j = 0; j < nz; j++) { 19039877982aSShri Abhyankar col = pj[j]; 19049371c9d4SSatish Balay pc4[j] = rtmp4[col]; 19059371c9d4SSatish Balay rs += PetscAbsScalar(pc4[j]); 19069877982aSShri Abhyankar } 19079877982aSShri Abhyankar /* U part */ 19089877982aSShri Abhyankar pc4 = b->a + bdiag[i + 4] + 1; 19099877982aSShri Abhyankar pj = b->j + bdiag[i + 4] + 1; 19109877982aSShri Abhyankar nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 19119877982aSShri Abhyankar for (j = 0; j < nz; j++) { 19129877982aSShri Abhyankar col = pj[j]; 19139371c9d4SSatish Balay pc4[j] = rtmp4[col]; 19149371c9d4SSatish Balay rs += PetscAbsScalar(pc4[j]); 19159877982aSShri Abhyankar } 19169877982aSShri Abhyankar 19179877982aSShri Abhyankar sctx.rs = rs; 19189877982aSShri Abhyankar sctx.pv = rtmp4[i + 3]; 19199566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 192007b50cabSHong Zhang if (sctx.newshift) 
break; 19219877982aSShri Abhyankar pc4 = b->a + bdiag[i + 3]; 19229877982aSShri Abhyankar *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 19239877982aSShri Abhyankar break; 192468785679SHong Zhang 1925d71ae5a4SJacob Faibussowitsch default: 1926d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 192728f1b45aSHong Zhang } 1928c2b86aeeSHong Zhang if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 192928f1b45aSHong Zhang i += nodesz; /* Update the row */ 193068785679SHong Zhang } 193128f1b45aSHong Zhang 193228f1b45aSHong Zhang /* MatPivotRefine() */ 193307b50cabSHong Zhang if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 193428f1b45aSHong Zhang /* 193528f1b45aSHong Zhang * if no shift in this attempt & shifting & started shifting & can refine, 193628f1b45aSHong Zhang * then try lower shift 193728f1b45aSHong Zhang */ 193828f1b45aSHong Zhang sctx.shift_hi = sctx.shift_fraction; 193928f1b45aSHong Zhang sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 194028f1b45aSHong Zhang sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 194107b50cabSHong Zhang sctx.newshift = PETSC_TRUE; 194228f1b45aSHong Zhang sctx.nshift++; 194328f1b45aSHong Zhang } 194407b50cabSHong Zhang } while (sctx.newshift); 194528f1b45aSHong Zhang 19469566063dSJacob Faibussowitsch PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 19479566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp_vec2)); 19489566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 19499566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 195028f1b45aSHong Zhang 1951*4d12350bSJunchao Zhang if (b->inode.size_csr) { 1952abb87a52SBarry Smith C->ops->solve = MatSolve_SeqAIJ_Inode; 1953abb87a52SBarry Smith } else { 1954d3ac4fa3SBarry Smith C->ops->solve = MatSolve_SeqAIJ; 1955abb87a52SBarry Smith } 195628f1b45aSHong Zhang 
C->ops->solveadd = MatSolveAdd_SeqAIJ; 195728f1b45aSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 195828f1b45aSHong Zhang C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 195928f1b45aSHong Zhang C->ops->matsolve = MatMatSolve_SeqAIJ; 1960a3d9026eSPierre Jolivet C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ; 196128f1b45aSHong Zhang C->assembled = PETSC_TRUE; 196228f1b45aSHong Zhang C->preallocated = PETSC_TRUE; 19632205254eSKarl Rupp 19649566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(C->cmap->n)); 196528f1b45aSHong Zhang 196628f1b45aSHong Zhang /* MatShiftView(A,info,&sctx) */ 196728f1b45aSHong Zhang if (sctx.nshift) { 1968f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 19699566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1970f4db908eSBarry Smith } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 19719566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1972f4db908eSBarry Smith } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 19739566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 197428f1b45aSHong Zhang } 197528f1b45aSHong Zhang } 19763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 197728f1b45aSHong Zhang } 1978628f99d7SShri Abhyankar 1979ff6a9541SJacob Faibussowitsch #if 0 1980ff6a9541SJacob Faibussowitsch // unused 1981ff6a9541SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) 1982d71ae5a4SJacob Faibussowitsch { 1983628f99d7SShri Abhyankar 
Mat C = B; 1984628f99d7SShri Abhyankar Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1985628f99d7SShri Abhyankar IS iscol = b->col, isrow = b->row, isicol = b->icol; 1986628f99d7SShri Abhyankar const PetscInt *r, *ic, *c, *ics; 1987628f99d7SShri Abhyankar PetscInt n = A->rmap->n, *bi = b->i; 1988628f99d7SShri Abhyankar PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow; 19898758e1faSBarry Smith PetscInt i, j, idx, *bd = b->diag, node_max, nodesz; 19908758e1faSBarry Smith PetscInt *ai = a->i, *aj = a->j; 1991628f99d7SShri Abhyankar PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj; 1992628f99d7SShri Abhyankar PetscScalar mul1, mul2, mul3, tmp; 1993628f99d7SShri Abhyankar MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33; 1994628f99d7SShri Abhyankar const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1; 1995628f99d7SShri Abhyankar PetscReal rs = 0.0; 1996628f99d7SShri Abhyankar FactorShiftCtx sctx; 1997628f99d7SShri Abhyankar 1998628f99d7SShri Abhyankar PetscFunctionBegin; 1999628f99d7SShri Abhyankar sctx.shift_top = 0; 2000628f99d7SShri Abhyankar sctx.nshift_max = 0; 2001628f99d7SShri Abhyankar sctx.shift_lo = 0; 2002628f99d7SShri Abhyankar sctx.shift_hi = 0; 2003628f99d7SShri Abhyankar sctx.shift_fraction = 0; 2004628f99d7SShri Abhyankar 2005628f99d7SShri Abhyankar /* if both shift schemes are chosen by user, only use info->shiftpd */ 2006f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 2007628f99d7SShri Abhyankar sctx.shift_top = 0; 2008628f99d7SShri Abhyankar for (i = 0; i < n; i++) { 2009628f99d7SShri Abhyankar /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 2010628f99d7SShri Abhyankar rs = 0.0; 2011628f99d7SShri Abhyankar ajtmp = aj + ai[i]; 2012628f99d7SShri Abhyankar rtmp1 = aa + ai[i]; 2013628f99d7SShri Abhyankar nz = ai[i + 1] - ai[i]; 2014628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 
2015628f99d7SShri Abhyankar if (*ajtmp != i) { 2016628f99d7SShri Abhyankar rs += PetscAbsScalar(*rtmp1++); 2017628f99d7SShri Abhyankar } else { 2018628f99d7SShri Abhyankar rs -= PetscRealPart(*rtmp1++); 2019628f99d7SShri Abhyankar } 2020628f99d7SShri Abhyankar ajtmp++; 2021628f99d7SShri Abhyankar } 2022628f99d7SShri Abhyankar if (rs > sctx.shift_top) sctx.shift_top = rs; 2023628f99d7SShri Abhyankar } 2024628f99d7SShri Abhyankar if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 2025628f99d7SShri Abhyankar sctx.shift_top *= 1.1; 2026628f99d7SShri Abhyankar sctx.nshift_max = 5; 2027628f99d7SShri Abhyankar sctx.shift_lo = 0.; 2028628f99d7SShri Abhyankar sctx.shift_hi = 1.; 2029628f99d7SShri Abhyankar } 2030628f99d7SShri Abhyankar sctx.shift_amount = 0; 2031628f99d7SShri Abhyankar sctx.nshift = 0; 2032628f99d7SShri Abhyankar 20339566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 20349566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &c)); 20359566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 20369566063dSJacob Faibussowitsch PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33)); 2037628f99d7SShri Abhyankar ics = ic; 2038628f99d7SShri Abhyankar 2039628f99d7SShri Abhyankar node_max = a->inode.node_count; 2040628f99d7SShri Abhyankar ns = a->inode.size; 204128b400f6SJacob Faibussowitsch PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 2042628f99d7SShri Abhyankar 2043628f99d7SShri Abhyankar /* If max inode size > 3, split it into two inodes.*/ 2044628f99d7SShri Abhyankar /* also map the inode sizes according to the ordering */ 20459566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 2046628f99d7SShri Abhyankar for (i = 0, j = 0; i < node_max; ++i, ++j) { 2047628f99d7SShri Abhyankar if (ns[i] > 3) { 2048628f99d7SShri Abhyankar tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */ 2049628f99d7SShri Abhyankar ++j; 2050628f99d7SShri Abhyankar tmp_vec1[j] = ns[i] - tmp_vec1[j - 
1]; 2051628f99d7SShri Abhyankar } else { 2052628f99d7SShri Abhyankar tmp_vec1[j] = ns[i]; 2053628f99d7SShri Abhyankar } 2054628f99d7SShri Abhyankar } 2055628f99d7SShri Abhyankar /* Use the correct node_max */ 2056628f99d7SShri Abhyankar node_max = j; 2057628f99d7SShri Abhyankar 2058628f99d7SShri Abhyankar /* Now reorder the inode info based on mat re-ordering info */ 2059628f99d7SShri Abhyankar /* First create a row -> inode_size_array_index map */ 20609566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2)); 2061628f99d7SShri Abhyankar for (i = 0, row = 0; i < node_max; i++) { 2062628f99d7SShri Abhyankar nodesz = tmp_vec1[i]; 2063ad540459SPierre Jolivet for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 2064628f99d7SShri Abhyankar } 2065628f99d7SShri Abhyankar /* Using nsmap, create a reordered ns structure */ 2066628f99d7SShri Abhyankar for (i = 0, j = 0; i < node_max; i++) { 2067628f99d7SShri Abhyankar nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 2068628f99d7SShri Abhyankar tmp_vec2[i] = nodesz; 2069628f99d7SShri Abhyankar j += nodesz; 2070628f99d7SShri Abhyankar } 20719566063dSJacob Faibussowitsch PetscCall(PetscFree2(nsmap, tmp_vec1)); 2072628f99d7SShri Abhyankar /* Now use the correct ns */ 2073628f99d7SShri Abhyankar ns = tmp_vec2; 2074628f99d7SShri Abhyankar 2075628f99d7SShri Abhyankar do { 207607b50cabSHong Zhang sctx.newshift = PETSC_FALSE; 2077628f99d7SShri Abhyankar /* Now loop over each block-row, and do the factorization */ 2078628f99d7SShri Abhyankar for (i = 0, row = 0; i < node_max; i++) { 2079628f99d7SShri Abhyankar nodesz = ns[i]; 2080628f99d7SShri Abhyankar nz = bi[row + 1] - bi[row]; 2081628f99d7SShri Abhyankar bjtmp = bj + bi[row]; 2082628f99d7SShri Abhyankar 2083628f99d7SShri Abhyankar switch (nodesz) { 2084628f99d7SShri Abhyankar case 1: 2085628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2086628f99d7SShri Abhyankar idx = bjtmp[j]; 2087628f99d7SShri Abhyankar rtmp11[idx] = 
0.0; 2088628f99d7SShri Abhyankar } 2089628f99d7SShri Abhyankar 2090628f99d7SShri Abhyankar /* load in initial (unfactored row) */ 2091628f99d7SShri Abhyankar idx = r[row]; 2092628f99d7SShri Abhyankar nz_tmp = ai[idx + 1] - ai[idx]; 2093628f99d7SShri Abhyankar ajtmp = aj + ai[idx]; 2094628f99d7SShri Abhyankar v1 = aa + ai[idx]; 2095628f99d7SShri Abhyankar 2096628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2097628f99d7SShri Abhyankar idx = ics[ajtmp[j]]; 2098628f99d7SShri Abhyankar rtmp11[idx] = v1[j]; 2099628f99d7SShri Abhyankar } 2100628f99d7SShri Abhyankar rtmp11[ics[r[row]]] += sctx.shift_amount; 2101628f99d7SShri Abhyankar 2102628f99d7SShri Abhyankar prow = *bjtmp++; 2103628f99d7SShri Abhyankar while (prow < row) { 2104628f99d7SShri Abhyankar pc1 = rtmp11 + prow; 2105628f99d7SShri Abhyankar if (*pc1 != 0.0) { 2106628f99d7SShri Abhyankar pv = ba + bd[prow]; 2107628f99d7SShri Abhyankar pj = nbj + bd[prow]; 2108628f99d7SShri Abhyankar mul1 = *pc1 * *pv++; 2109628f99d7SShri Abhyankar *pc1 = mul1; 2110628f99d7SShri Abhyankar nz_tmp = bi[prow + 1] - bd[prow] - 1; 21119566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2112628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2113628f99d7SShri Abhyankar tmp = pv[j]; 2114628f99d7SShri Abhyankar idx = pj[j]; 2115628f99d7SShri Abhyankar rtmp11[idx] -= mul1 * tmp; 2116628f99d7SShri Abhyankar } 2117628f99d7SShri Abhyankar } 2118628f99d7SShri Abhyankar prow = *bjtmp++; 2119628f99d7SShri Abhyankar } 2120628f99d7SShri Abhyankar pj = bj + bi[row]; 2121628f99d7SShri Abhyankar pc1 = ba + bi[row]; 2122628f99d7SShri Abhyankar 2123628f99d7SShri Abhyankar sctx.pv = rtmp11[row]; 2124628f99d7SShri Abhyankar rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */ 2125628f99d7SShri Abhyankar rs = 0.0; 2126628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2127628f99d7SShri Abhyankar idx = pj[j]; 2128628f99d7SShri Abhyankar pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2129628f99d7SShri Abhyankar if (idx != row) rs += 
PetscAbsScalar(pc1[j]); 2130628f99d7SShri Abhyankar } 2131628f99d7SShri Abhyankar sctx.rs = rs; 21329566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 213307b50cabSHong Zhang if (sctx.newshift) goto endofwhile; 2134628f99d7SShri Abhyankar break; 2135628f99d7SShri Abhyankar 2136628f99d7SShri Abhyankar case 2: 2137628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2138628f99d7SShri Abhyankar idx = bjtmp[j]; 2139628f99d7SShri Abhyankar rtmp11[idx] = 0.0; 2140628f99d7SShri Abhyankar rtmp22[idx] = 0.0; 2141628f99d7SShri Abhyankar } 2142628f99d7SShri Abhyankar 2143628f99d7SShri Abhyankar /* load in initial (unfactored row) */ 2144628f99d7SShri Abhyankar idx = r[row]; 2145628f99d7SShri Abhyankar nz_tmp = ai[idx + 1] - ai[idx]; 2146628f99d7SShri Abhyankar ajtmp = aj + ai[idx]; 2147628f99d7SShri Abhyankar v1 = aa + ai[idx]; 2148628f99d7SShri Abhyankar v2 = aa + ai[idx + 1]; 2149628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2150628f99d7SShri Abhyankar idx = ics[ajtmp[j]]; 2151628f99d7SShri Abhyankar rtmp11[idx] = v1[j]; 2152628f99d7SShri Abhyankar rtmp22[idx] = v2[j]; 2153628f99d7SShri Abhyankar } 2154628f99d7SShri Abhyankar rtmp11[ics[r[row]]] += sctx.shift_amount; 2155628f99d7SShri Abhyankar rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2156628f99d7SShri Abhyankar 2157628f99d7SShri Abhyankar prow = *bjtmp++; 2158628f99d7SShri Abhyankar while (prow < row) { 2159628f99d7SShri Abhyankar pc1 = rtmp11 + prow; 2160628f99d7SShri Abhyankar pc2 = rtmp22 + prow; 2161628f99d7SShri Abhyankar if (*pc1 != 0.0 || *pc2 != 0.0) { 2162628f99d7SShri Abhyankar pv = ba + bd[prow]; 2163628f99d7SShri Abhyankar pj = nbj + bd[prow]; 2164628f99d7SShri Abhyankar mul1 = *pc1 * *pv; 2165628f99d7SShri Abhyankar mul2 = *pc2 * *pv; 2166628f99d7SShri Abhyankar ++pv; 2167628f99d7SShri Abhyankar *pc1 = mul1; 2168628f99d7SShri Abhyankar *pc2 = mul2; 2169628f99d7SShri Abhyankar 2170628f99d7SShri Abhyankar nz_tmp = bi[prow + 1] - bd[prow] - 1; 2171628f99d7SShri Abhyankar 
for (j = 0; j < nz_tmp; j++) { 2172628f99d7SShri Abhyankar tmp = pv[j]; 2173628f99d7SShri Abhyankar idx = pj[j]; 2174628f99d7SShri Abhyankar rtmp11[idx] -= mul1 * tmp; 2175628f99d7SShri Abhyankar rtmp22[idx] -= mul2 * tmp; 2176628f99d7SShri Abhyankar } 21779566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2178628f99d7SShri Abhyankar } 2179628f99d7SShri Abhyankar prow = *bjtmp++; 2180628f99d7SShri Abhyankar } 2181628f99d7SShri Abhyankar 2182628f99d7SShri Abhyankar /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2183628f99d7SShri Abhyankar pc1 = rtmp11 + prow; 2184628f99d7SShri Abhyankar pc2 = rtmp22 + prow; 2185628f99d7SShri Abhyankar 2186628f99d7SShri Abhyankar sctx.pv = *pc1; 2187628f99d7SShri Abhyankar pj = bj + bi[prow]; 2188628f99d7SShri Abhyankar rs = 0.0; 2189628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2190628f99d7SShri Abhyankar idx = pj[j]; 2191628f99d7SShri Abhyankar if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2192628f99d7SShri Abhyankar } 2193628f99d7SShri Abhyankar sctx.rs = rs; 21949566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 219507b50cabSHong Zhang if (sctx.newshift) goto endofwhile; 2196628f99d7SShri Abhyankar 2197628f99d7SShri Abhyankar if (*pc2 != 0.0) { 2198628f99d7SShri Abhyankar pj = nbj + bd[prow]; 2199628f99d7SShri Abhyankar mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/ 2200628f99d7SShri Abhyankar *pc2 = mul2; 2201628f99d7SShri Abhyankar nz_tmp = bi[prow + 1] - bd[prow] - 1; 2202628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2203628f99d7SShri Abhyankar idx = pj[j]; 2204628f99d7SShri Abhyankar tmp = rtmp11[idx]; 2205628f99d7SShri Abhyankar rtmp22[idx] -= mul2 * tmp; 2206628f99d7SShri Abhyankar } 22079566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2208628f99d7SShri Abhyankar } 2209628f99d7SShri Abhyankar 2210628f99d7SShri Abhyankar pj = bj + bi[row]; 2211628f99d7SShri Abhyankar pc1 = ba + bi[row]; 
2212628f99d7SShri Abhyankar pc2 = ba + bi[row + 1]; 2213628f99d7SShri Abhyankar 2214628f99d7SShri Abhyankar sctx.pv = rtmp22[row + 1]; 2215628f99d7SShri Abhyankar rs = 0.0; 2216628f99d7SShri Abhyankar rtmp11[row] = 1.0 / rtmp11[row]; 2217628f99d7SShri Abhyankar rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2218628f99d7SShri Abhyankar /* copy row entries from dense representation to sparse */ 2219628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2220628f99d7SShri Abhyankar idx = pj[j]; 2221628f99d7SShri Abhyankar pc1[j] = rtmp11[idx]; 2222628f99d7SShri Abhyankar pc2[j] = rtmp22[idx]; 2223628f99d7SShri Abhyankar if (idx != row + 1) rs += PetscAbsScalar(pc2[j]); 2224628f99d7SShri Abhyankar } 2225628f99d7SShri Abhyankar sctx.rs = rs; 22269566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 222707b50cabSHong Zhang if (sctx.newshift) goto endofwhile; 2228628f99d7SShri Abhyankar break; 2229628f99d7SShri Abhyankar 2230628f99d7SShri Abhyankar case 3: 2231628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2232628f99d7SShri Abhyankar idx = bjtmp[j]; 2233628f99d7SShri Abhyankar rtmp11[idx] = 0.0; 2234628f99d7SShri Abhyankar rtmp22[idx] = 0.0; 2235628f99d7SShri Abhyankar rtmp33[idx] = 0.0; 2236628f99d7SShri Abhyankar } 2237628f99d7SShri Abhyankar /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2238628f99d7SShri Abhyankar idx = r[row]; 2239628f99d7SShri Abhyankar nz_tmp = ai[idx + 1] - ai[idx]; 2240628f99d7SShri Abhyankar ajtmp = aj + ai[idx]; 2241628f99d7SShri Abhyankar v1 = aa + ai[idx]; 2242628f99d7SShri Abhyankar v2 = aa + ai[idx + 1]; 2243628f99d7SShri Abhyankar v3 = aa + ai[idx + 2]; 2244628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2245628f99d7SShri Abhyankar idx = ics[ajtmp[j]]; 2246628f99d7SShri Abhyankar rtmp11[idx] = v1[j]; 2247628f99d7SShri Abhyankar rtmp22[idx] = v2[j]; 2248628f99d7SShri Abhyankar rtmp33[idx] = v3[j]; 2249628f99d7SShri Abhyankar } 2250628f99d7SShri Abhyankar 
rtmp11[ics[r[row]]] += sctx.shift_amount; 2251628f99d7SShri Abhyankar rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2252628f99d7SShri Abhyankar rtmp33[ics[r[row + 2]]] += sctx.shift_amount; 2253628f99d7SShri Abhyankar 2254628f99d7SShri Abhyankar /* loop over all pivot row blocks above this row block */ 2255628f99d7SShri Abhyankar prow = *bjtmp++; 2256628f99d7SShri Abhyankar while (prow < row) { 2257628f99d7SShri Abhyankar pc1 = rtmp11 + prow; 2258628f99d7SShri Abhyankar pc2 = rtmp22 + prow; 2259628f99d7SShri Abhyankar pc3 = rtmp33 + prow; 2260628f99d7SShri Abhyankar if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 2261628f99d7SShri Abhyankar pv = ba + bd[prow]; 2262628f99d7SShri Abhyankar pj = nbj + bd[prow]; 2263628f99d7SShri Abhyankar mul1 = *pc1 * *pv; 2264628f99d7SShri Abhyankar mul2 = *pc2 * *pv; 2265628f99d7SShri Abhyankar mul3 = *pc3 * *pv; 2266628f99d7SShri Abhyankar ++pv; 2267628f99d7SShri Abhyankar *pc1 = mul1; 2268628f99d7SShri Abhyankar *pc2 = mul2; 2269628f99d7SShri Abhyankar *pc3 = mul3; 2270628f99d7SShri Abhyankar 2271628f99d7SShri Abhyankar nz_tmp = bi[prow + 1] - bd[prow] - 1; 2272628f99d7SShri Abhyankar /* update this row based on pivot row */ 2273628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2274628f99d7SShri Abhyankar tmp = pv[j]; 2275628f99d7SShri Abhyankar idx = pj[j]; 2276628f99d7SShri Abhyankar rtmp11[idx] -= mul1 * tmp; 2277628f99d7SShri Abhyankar rtmp22[idx] -= mul2 * tmp; 2278628f99d7SShri Abhyankar rtmp33[idx] -= mul3 * tmp; 2279628f99d7SShri Abhyankar } 22809566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp)); 2281628f99d7SShri Abhyankar } 2282628f99d7SShri Abhyankar prow = *bjtmp++; 2283628f99d7SShri Abhyankar } 2284628f99d7SShri Abhyankar 2285628f99d7SShri Abhyankar /* Now take care of diagonal 3x3 block in this set of rows */ 2286628f99d7SShri Abhyankar /* note: prow = row here */ 2287628f99d7SShri Abhyankar pc1 = rtmp11 + prow; 2288628f99d7SShri Abhyankar pc2 = rtmp22 + prow; 2289628f99d7SShri Abhyankar 
pc3 = rtmp33 + prow; 2290628f99d7SShri Abhyankar 2291628f99d7SShri Abhyankar sctx.pv = *pc1; 2292628f99d7SShri Abhyankar pj = bj + bi[prow]; 2293628f99d7SShri Abhyankar rs = 0.0; 2294628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2295628f99d7SShri Abhyankar idx = pj[j]; 2296628f99d7SShri Abhyankar if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2297628f99d7SShri Abhyankar } 2298628f99d7SShri Abhyankar sctx.rs = rs; 22999566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 230007b50cabSHong Zhang if (sctx.newshift) goto endofwhile; 2301628f99d7SShri Abhyankar 2302628f99d7SShri Abhyankar if (*pc2 != 0.0 || *pc3 != 0.0) { 2303628f99d7SShri Abhyankar mul2 = (*pc2) / (*pc1); 2304628f99d7SShri Abhyankar mul3 = (*pc3) / (*pc1); 2305628f99d7SShri Abhyankar *pc2 = mul2; 2306628f99d7SShri Abhyankar *pc3 = mul3; 2307628f99d7SShri Abhyankar nz_tmp = bi[prow + 1] - bd[prow] - 1; 2308628f99d7SShri Abhyankar pj = nbj + bd[prow]; 2309628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2310628f99d7SShri Abhyankar idx = pj[j]; 2311628f99d7SShri Abhyankar tmp = rtmp11[idx]; 2312628f99d7SShri Abhyankar rtmp22[idx] -= mul2 * tmp; 2313628f99d7SShri Abhyankar rtmp33[idx] -= mul3 * tmp; 2314628f99d7SShri Abhyankar } 23159566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2316628f99d7SShri Abhyankar } 2317628f99d7SShri Abhyankar ++prow; 2318628f99d7SShri Abhyankar 2319628f99d7SShri Abhyankar pc2 = rtmp22 + prow; 2320628f99d7SShri Abhyankar pc3 = rtmp33 + prow; 2321628f99d7SShri Abhyankar sctx.pv = *pc2; 2322628f99d7SShri Abhyankar pj = bj + bi[prow]; 2323628f99d7SShri Abhyankar rs = 0.0; 2324628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2325628f99d7SShri Abhyankar idx = pj[j]; 2326628f99d7SShri Abhyankar if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2327628f99d7SShri Abhyankar } 2328628f99d7SShri Abhyankar sctx.rs = rs; 23299566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 
233007b50cabSHong Zhang if (sctx.newshift) goto endofwhile; 2331628f99d7SShri Abhyankar 2332628f99d7SShri Abhyankar if (*pc3 != 0.0) { 2333628f99d7SShri Abhyankar mul3 = (*pc3) / (*pc2); 2334628f99d7SShri Abhyankar *pc3 = mul3; 2335628f99d7SShri Abhyankar pj = nbj + bd[prow]; 2336628f99d7SShri Abhyankar nz_tmp = bi[prow + 1] - bd[prow] - 1; 2337628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2338628f99d7SShri Abhyankar idx = pj[j]; 2339628f99d7SShri Abhyankar tmp = rtmp22[idx]; 2340628f99d7SShri Abhyankar rtmp33[idx] -= mul3 * tmp; 2341628f99d7SShri Abhyankar } 23429566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2343628f99d7SShri Abhyankar } 2344628f99d7SShri Abhyankar 2345628f99d7SShri Abhyankar pj = bj + bi[row]; 2346628f99d7SShri Abhyankar pc1 = ba + bi[row]; 2347628f99d7SShri Abhyankar pc2 = ba + bi[row + 1]; 2348628f99d7SShri Abhyankar pc3 = ba + bi[row + 2]; 2349628f99d7SShri Abhyankar 2350628f99d7SShri Abhyankar sctx.pv = rtmp33[row + 2]; 2351628f99d7SShri Abhyankar rs = 0.0; 2352628f99d7SShri Abhyankar rtmp11[row] = 1.0 / rtmp11[row]; 2353628f99d7SShri Abhyankar rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2354628f99d7SShri Abhyankar rtmp33[row + 2] = 1.0 / rtmp33[row + 2]; 2355628f99d7SShri Abhyankar /* copy row entries from dense representation to sparse */ 2356628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2357628f99d7SShri Abhyankar idx = pj[j]; 2358628f99d7SShri Abhyankar pc1[j] = rtmp11[idx]; 2359628f99d7SShri Abhyankar pc2[j] = rtmp22[idx]; 2360628f99d7SShri Abhyankar pc3[j] = rtmp33[idx]; 2361628f99d7SShri Abhyankar if (idx != row + 2) rs += PetscAbsScalar(pc3[j]); 2362628f99d7SShri Abhyankar } 2363628f99d7SShri Abhyankar 2364628f99d7SShri Abhyankar sctx.rs = rs; 23659566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2)); 236607b50cabSHong Zhang if (sctx.newshift) goto endofwhile; 2367628f99d7SShri Abhyankar break; 2368628f99d7SShri Abhyankar 2369d71ae5a4SJacob Faibussowitsch default: 
2370d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 2371628f99d7SShri Abhyankar } 2372628f99d7SShri Abhyankar row += nodesz; /* Update the row */ 2373628f99d7SShri Abhyankar } 2374628f99d7SShri Abhyankar endofwhile:; 237507b50cabSHong Zhang } while (sctx.newshift); 23769566063dSJacob Faibussowitsch PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33)); 23779566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp_vec2)); 23789566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 23799566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 23809566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &c)); 23812205254eSKarl Rupp 2382d3ac4fa3SBarry Smith (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2383628f99d7SShri Abhyankar /* do not set solve add, since MatSolve_Inode + Add is faster */ 2384628f99d7SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2385628f99d7SShri Abhyankar C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2386628f99d7SShri Abhyankar C->assembled = PETSC_TRUE; 2387628f99d7SShri Abhyankar C->preallocated = PETSC_TRUE; 2388628f99d7SShri Abhyankar if (sctx.nshift) { 2389f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 23909566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 2391f4db908eSBarry Smith } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 23929566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 2393628f99d7SShri Abhyankar } 2394628f99d7SShri Abhyankar } 23959566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(C->cmap->n)); 23969566063dSJacob Faibussowitsch 
PetscCall(MatSeqAIJCheckInode(C)); 23973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2398628f99d7SShri Abhyankar } 2399ff6a9541SJacob Faibussowitsch #endif 2400628f99d7SShri Abhyankar 2401d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 2402d71ae5a4SJacob Faibussowitsch { 2403019b515eSShri Abhyankar Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2404019b515eSShri Abhyankar IS iscol = a->col, isrow = a->row; 2405019b515eSShri Abhyankar const PetscInt *r, *c, *rout, *cout; 2406*4d12350bSJunchao Zhang PetscInt i, j; 24078758e1faSBarry Smith PetscInt node_max, row, nsz, aii, i0, i1, nz; 24088758e1faSBarry Smith const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2409019b515eSShri Abhyankar PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2410019b515eSShri Abhyankar PetscScalar sum1, sum2, sum3, sum4, sum5; 2411019b515eSShri Abhyankar const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2412019b515eSShri Abhyankar const PetscScalar *b; 2413019b515eSShri Abhyankar 2414019b515eSShri Abhyankar PetscFunctionBegin; 2415*4d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2416019b515eSShri Abhyankar node_max = a->inode.node_count; 2417*4d12350bSJunchao Zhang ns = a->inode.size_csr; /* Node Size array */ 2418019b515eSShri Abhyankar 24199566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 24209566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(xx, &x)); 2421019b515eSShri Abhyankar tmp = a->solve_work; 2422019b515eSShri Abhyankar 24239371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 24249371c9d4SSatish Balay r = rout; 24259371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 24269371c9d4SSatish Balay c = cout; 2427019b515eSShri Abhyankar 2428019b515eSShri Abhyankar /* forward solve the lower triangular */ 2429019b515eSShri Abhyankar tmps = tmp; 2430019b515eSShri Abhyankar aa = a_a; 2431019b515eSShri Abhyankar aj = a_j; 
2432019b515eSShri Abhyankar ad = a->diag; 2433019b515eSShri Abhyankar 2434*4d12350bSJunchao Zhang for (i = 0; i < node_max; ++i) { 2435*4d12350bSJunchao Zhang row = ns[i]; 2436*4d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i]; 2437019b515eSShri Abhyankar aii = ai[row]; 2438019b515eSShri Abhyankar v1 = aa + aii; 2439019b515eSShri Abhyankar vi = aj + aii; 2440019b515eSShri Abhyankar nz = ai[row + 1] - ai[row]; 2441019b515eSShri Abhyankar 244298991853SShri Abhyankar if (i < node_max - 1) { 244398991853SShri Abhyankar /* Prefetch the indices for the next block */ 244450d8bf02SJed Brown PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 244598991853SShri Abhyankar /* Prefetch the data for the next block */ 2446*4d12350bSJunchao Zhang PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 244798991853SShri Abhyankar } 244898991853SShri Abhyankar 2449019b515eSShri Abhyankar switch (nsz) { /* Each loop in 'case' is unrolled */ 2450019b515eSShri Abhyankar case 1: 2451019b515eSShri Abhyankar sum1 = b[r[row]]; 2452019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2453019b515eSShri Abhyankar i0 = vi[j]; 2454019b515eSShri Abhyankar i1 = vi[j + 1]; 2455019b515eSShri Abhyankar tmp0 = tmps[i0]; 2456019b515eSShri Abhyankar tmp1 = tmps[i1]; 2457019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2458019b515eSShri Abhyankar } 2459019b515eSShri Abhyankar if (j == nz - 1) { 2460019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2461019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2462019b515eSShri Abhyankar } 2463019b515eSShri Abhyankar tmp[row++] = sum1; 2464019b515eSShri Abhyankar break; 2465019b515eSShri Abhyankar case 2: 2466019b515eSShri Abhyankar sum1 = b[r[row]]; 2467019b515eSShri Abhyankar sum2 = b[r[row + 1]]; 2468019b515eSShri Abhyankar v2 = aa + ai[row + 1]; 2469019b515eSShri Abhyankar 2470019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 
2471019b515eSShri Abhyankar i0 = vi[j]; 2472019b515eSShri Abhyankar i1 = vi[j + 1]; 2473019b515eSShri Abhyankar tmp0 = tmps[i0]; 2474019b515eSShri Abhyankar tmp1 = tmps[i1]; 2475019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2476019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2477019b515eSShri Abhyankar } 2478019b515eSShri Abhyankar if (j == nz - 1) { 2479019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2480019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2481019b515eSShri Abhyankar sum2 -= v2[j] * tmp0; 2482019b515eSShri Abhyankar } 2483019b515eSShri Abhyankar sum2 -= v2[nz] * sum1; 2484019b515eSShri Abhyankar tmp[row++] = sum1; 2485019b515eSShri Abhyankar tmp[row++] = sum2; 2486019b515eSShri Abhyankar break; 2487019b515eSShri Abhyankar case 3: 2488019b515eSShri Abhyankar sum1 = b[r[row]]; 2489019b515eSShri Abhyankar sum2 = b[r[row + 1]]; 2490019b515eSShri Abhyankar sum3 = b[r[row + 2]]; 2491019b515eSShri Abhyankar v2 = aa + ai[row + 1]; 2492019b515eSShri Abhyankar v3 = aa + ai[row + 2]; 2493019b515eSShri Abhyankar 2494019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2495019b515eSShri Abhyankar i0 = vi[j]; 2496019b515eSShri Abhyankar i1 = vi[j + 1]; 2497019b515eSShri Abhyankar tmp0 = tmps[i0]; 2498019b515eSShri Abhyankar tmp1 = tmps[i1]; 2499019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2500019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2501019b515eSShri Abhyankar sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2502019b515eSShri Abhyankar } 2503019b515eSShri Abhyankar if (j == nz - 1) { 2504019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2505019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2506019b515eSShri Abhyankar sum2 -= v2[j] * tmp0; 2507019b515eSShri Abhyankar sum3 -= v3[j] * tmp0; 2508019b515eSShri Abhyankar } 2509019b515eSShri Abhyankar sum2 -= v2[nz] * sum1; 2510019b515eSShri Abhyankar sum3 -= v3[nz] * sum1; 2511019b515eSShri Abhyankar sum3 -= v3[nz + 1] * sum2; 2512019b515eSShri Abhyankar 
tmp[row++] = sum1; 2513019b515eSShri Abhyankar tmp[row++] = sum2; 2514019b515eSShri Abhyankar tmp[row++] = sum3; 2515019b515eSShri Abhyankar break; 2516019b515eSShri Abhyankar 2517019b515eSShri Abhyankar case 4: 2518019b515eSShri Abhyankar sum1 = b[r[row]]; 2519019b515eSShri Abhyankar sum2 = b[r[row + 1]]; 2520019b515eSShri Abhyankar sum3 = b[r[row + 2]]; 2521019b515eSShri Abhyankar sum4 = b[r[row + 3]]; 2522019b515eSShri Abhyankar v2 = aa + ai[row + 1]; 2523019b515eSShri Abhyankar v3 = aa + ai[row + 2]; 2524019b515eSShri Abhyankar v4 = aa + ai[row + 3]; 2525019b515eSShri Abhyankar 2526019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2527019b515eSShri Abhyankar i0 = vi[j]; 2528019b515eSShri Abhyankar i1 = vi[j + 1]; 2529019b515eSShri Abhyankar tmp0 = tmps[i0]; 2530019b515eSShri Abhyankar tmp1 = tmps[i1]; 2531019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2532019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2533019b515eSShri Abhyankar sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2534019b515eSShri Abhyankar sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2535019b515eSShri Abhyankar } 2536019b515eSShri Abhyankar if (j == nz - 1) { 2537019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2538019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2539019b515eSShri Abhyankar sum2 -= v2[j] * tmp0; 2540019b515eSShri Abhyankar sum3 -= v3[j] * tmp0; 2541019b515eSShri Abhyankar sum4 -= v4[j] * tmp0; 2542019b515eSShri Abhyankar } 2543019b515eSShri Abhyankar sum2 -= v2[nz] * sum1; 2544019b515eSShri Abhyankar sum3 -= v3[nz] * sum1; 2545019b515eSShri Abhyankar sum4 -= v4[nz] * sum1; 2546019b515eSShri Abhyankar sum3 -= v3[nz + 1] * sum2; 2547019b515eSShri Abhyankar sum4 -= v4[nz + 1] * sum2; 2548019b515eSShri Abhyankar sum4 -= v4[nz + 2] * sum3; 2549019b515eSShri Abhyankar 2550019b515eSShri Abhyankar tmp[row++] = sum1; 2551019b515eSShri Abhyankar tmp[row++] = sum2; 2552019b515eSShri Abhyankar tmp[row++] = sum3; 2553019b515eSShri Abhyankar tmp[row++] = sum4; 
2554019b515eSShri Abhyankar break; 2555019b515eSShri Abhyankar case 5: 2556019b515eSShri Abhyankar sum1 = b[r[row]]; 2557019b515eSShri Abhyankar sum2 = b[r[row + 1]]; 2558019b515eSShri Abhyankar sum3 = b[r[row + 2]]; 2559019b515eSShri Abhyankar sum4 = b[r[row + 3]]; 2560019b515eSShri Abhyankar sum5 = b[r[row + 4]]; 2561019b515eSShri Abhyankar v2 = aa + ai[row + 1]; 2562019b515eSShri Abhyankar v3 = aa + ai[row + 2]; 2563019b515eSShri Abhyankar v4 = aa + ai[row + 3]; 2564019b515eSShri Abhyankar v5 = aa + ai[row + 4]; 2565019b515eSShri Abhyankar 2566019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2567019b515eSShri Abhyankar i0 = vi[j]; 2568019b515eSShri Abhyankar i1 = vi[j + 1]; 2569019b515eSShri Abhyankar tmp0 = tmps[i0]; 2570019b515eSShri Abhyankar tmp1 = tmps[i1]; 2571019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2572019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2573019b515eSShri Abhyankar sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2574019b515eSShri Abhyankar sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2575019b515eSShri Abhyankar sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2576019b515eSShri Abhyankar } 2577019b515eSShri Abhyankar if (j == nz - 1) { 2578019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2579019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2580019b515eSShri Abhyankar sum2 -= v2[j] * tmp0; 2581019b515eSShri Abhyankar sum3 -= v3[j] * tmp0; 2582019b515eSShri Abhyankar sum4 -= v4[j] * tmp0; 2583019b515eSShri Abhyankar sum5 -= v5[j] * tmp0; 2584019b515eSShri Abhyankar } 2585019b515eSShri Abhyankar 2586019b515eSShri Abhyankar sum2 -= v2[nz] * sum1; 2587019b515eSShri Abhyankar sum3 -= v3[nz] * sum1; 2588019b515eSShri Abhyankar sum4 -= v4[nz] * sum1; 2589019b515eSShri Abhyankar sum5 -= v5[nz] * sum1; 2590019b515eSShri Abhyankar sum3 -= v3[nz + 1] * sum2; 2591019b515eSShri Abhyankar sum4 -= v4[nz + 1] * sum2; 2592019b515eSShri Abhyankar sum5 -= v5[nz + 1] * sum2; 2593019b515eSShri Abhyankar sum4 -= v4[nz + 2] * sum3; 
2594019b515eSShri Abhyankar sum5 -= v5[nz + 2] * sum3; 2595019b515eSShri Abhyankar sum5 -= v5[nz + 3] * sum4; 2596019b515eSShri Abhyankar 2597019b515eSShri Abhyankar tmp[row++] = sum1; 2598019b515eSShri Abhyankar tmp[row++] = sum2; 2599019b515eSShri Abhyankar tmp[row++] = sum3; 2600019b515eSShri Abhyankar tmp[row++] = sum4; 2601019b515eSShri Abhyankar tmp[row++] = sum5; 2602019b515eSShri Abhyankar break; 2603d71ae5a4SJacob Faibussowitsch default: 2604d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2605019b515eSShri Abhyankar } 2606019b515eSShri Abhyankar } 2607019b515eSShri Abhyankar /* backward solve the upper triangular */ 2608*4d12350bSJunchao Zhang for (i = node_max - 1; i >= 0; i--) { 2609*4d12350bSJunchao Zhang row = ns[i + 1] - 1; 2610*4d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i]; 2611019b515eSShri Abhyankar aii = ad[row + 1] + 1; 2612019b515eSShri Abhyankar v1 = aa + aii; 2613019b515eSShri Abhyankar vi = aj + aii; 2614019b515eSShri Abhyankar nz = ad[row] - ad[row + 1] - 1; 261598991853SShri Abhyankar 261698991853SShri Abhyankar if (i > 0) { 261798991853SShri Abhyankar /* Prefetch the indices for the next block */ 261850d8bf02SJed Brown PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 261998991853SShri Abhyankar /* Prefetch the data for the next block */ 2620*4d12350bSJunchao Zhang PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 262198991853SShri Abhyankar } 262298991853SShri Abhyankar 2623019b515eSShri Abhyankar switch (nsz) { /* Each loop in 'case' is unrolled */ 2624019b515eSShri Abhyankar case 1: 2625019b515eSShri Abhyankar sum1 = tmp[row]; 2626019b515eSShri Abhyankar 2627019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2628019b515eSShri Abhyankar i0 = vi[j]; 2629019b515eSShri Abhyankar i1 = vi[j + 1]; 2630019b515eSShri Abhyankar tmp0 = tmps[i0]; 
2631019b515eSShri Abhyankar tmp1 = tmps[i1]; 2632019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2633019b515eSShri Abhyankar } 2634019b515eSShri Abhyankar if (j == nz - 1) { 2635019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2636019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2637019b515eSShri Abhyankar } 26389371c9d4SSatish Balay x[c[row]] = tmp[row] = sum1 * v1[nz]; 26399371c9d4SSatish Balay row--; 2640019b515eSShri Abhyankar break; 2641019b515eSShri Abhyankar case 2: 2642019b515eSShri Abhyankar sum1 = tmp[row]; 2643019b515eSShri Abhyankar sum2 = tmp[row - 1]; 2644019b515eSShri Abhyankar v2 = aa + ad[row] + 1; 2645019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2646019b515eSShri Abhyankar i0 = vi[j]; 2647019b515eSShri Abhyankar i1 = vi[j + 1]; 2648019b515eSShri Abhyankar tmp0 = tmps[i0]; 2649019b515eSShri Abhyankar tmp1 = tmps[i1]; 2650019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2651019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2652019b515eSShri Abhyankar } 2653019b515eSShri Abhyankar if (j == nz - 1) { 2654019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2655019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2656019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0; 2657019b515eSShri Abhyankar } 2658019b515eSShri Abhyankar 26599371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 26609371c9d4SSatish Balay row--; 2661019b515eSShri Abhyankar sum2 -= v2[0] * tmp0; 26629371c9d4SSatish Balay x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 26639371c9d4SSatish Balay row--; 2664019b515eSShri Abhyankar break; 2665019b515eSShri Abhyankar case 3: 2666019b515eSShri Abhyankar sum1 = tmp[row]; 2667019b515eSShri Abhyankar sum2 = tmp[row - 1]; 2668019b515eSShri Abhyankar sum3 = tmp[row - 2]; 2669019b515eSShri Abhyankar v2 = aa + ad[row] + 1; 2670019b515eSShri Abhyankar v3 = aa + ad[row - 1] + 1; 2671019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2672019b515eSShri Abhyankar i0 = vi[j]; 2673019b515eSShri 
Abhyankar i1 = vi[j + 1]; 2674019b515eSShri Abhyankar tmp0 = tmps[i0]; 2675019b515eSShri Abhyankar tmp1 = tmps[i1]; 2676019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2677019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2678019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2679019b515eSShri Abhyankar } 2680019b515eSShri Abhyankar if (j == nz - 1) { 2681019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2682019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2683019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0; 2684019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0; 2685019b515eSShri Abhyankar } 26869371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 26879371c9d4SSatish Balay row--; 2688019b515eSShri Abhyankar sum2 -= v2[0] * tmp0; 2689019b515eSShri Abhyankar sum3 -= v3[1] * tmp0; 26909371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 26919371c9d4SSatish Balay row--; 2692019b515eSShri Abhyankar sum3 -= v3[0] * tmp0; 26939371c9d4SSatish Balay x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 26949371c9d4SSatish Balay row--; 2695019b515eSShri Abhyankar 2696019b515eSShri Abhyankar break; 2697019b515eSShri Abhyankar case 4: 2698019b515eSShri Abhyankar sum1 = tmp[row]; 2699019b515eSShri Abhyankar sum2 = tmp[row - 1]; 2700019b515eSShri Abhyankar sum3 = tmp[row - 2]; 2701019b515eSShri Abhyankar sum4 = tmp[row - 3]; 2702019b515eSShri Abhyankar v2 = aa + ad[row] + 1; 2703019b515eSShri Abhyankar v3 = aa + ad[row - 1] + 1; 2704019b515eSShri Abhyankar v4 = aa + ad[row - 2] + 1; 2705019b515eSShri Abhyankar 2706019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2707019b515eSShri Abhyankar i0 = vi[j]; 2708019b515eSShri Abhyankar i1 = vi[j + 1]; 2709019b515eSShri Abhyankar tmp0 = tmps[i0]; 2710019b515eSShri Abhyankar tmp1 = tmps[i1]; 2711019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2712019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2713019b515eSShri Abhyankar sum3 
-= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2714019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2715019b515eSShri Abhyankar } 2716019b515eSShri Abhyankar if (j == nz - 1) { 2717019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2718019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2719019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0; 2720019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0; 2721019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0; 2722019b515eSShri Abhyankar } 2723019b515eSShri Abhyankar 27249371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 27259371c9d4SSatish Balay row--; 2726019b515eSShri Abhyankar sum2 -= v2[0] * tmp0; 2727019b515eSShri Abhyankar sum3 -= v3[1] * tmp0; 2728019b515eSShri Abhyankar sum4 -= v4[2] * tmp0; 27299371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 27309371c9d4SSatish Balay row--; 2731019b515eSShri Abhyankar sum3 -= v3[0] * tmp0; 2732019b515eSShri Abhyankar sum4 -= v4[1] * tmp0; 27339371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 27349371c9d4SSatish Balay row--; 2735019b515eSShri Abhyankar sum4 -= v4[0] * tmp0; 27369371c9d4SSatish Balay x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 27379371c9d4SSatish Balay row--; 2738019b515eSShri Abhyankar break; 2739019b515eSShri Abhyankar case 5: 2740019b515eSShri Abhyankar sum1 = tmp[row]; 2741019b515eSShri Abhyankar sum2 = tmp[row - 1]; 2742019b515eSShri Abhyankar sum3 = tmp[row - 2]; 2743019b515eSShri Abhyankar sum4 = tmp[row - 3]; 2744019b515eSShri Abhyankar sum5 = tmp[row - 4]; 2745019b515eSShri Abhyankar v2 = aa + ad[row] + 1; 2746019b515eSShri Abhyankar v3 = aa + ad[row - 1] + 1; 2747019b515eSShri Abhyankar v4 = aa + ad[row - 2] + 1; 2748019b515eSShri Abhyankar v5 = aa + ad[row - 3] + 1; 2749019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2750019b515eSShri Abhyankar i0 = vi[j]; 2751019b515eSShri Abhyankar i1 = vi[j + 1]; 2752019b515eSShri Abhyankar tmp0 = tmps[i0]; 2753019b515eSShri Abhyankar tmp1 = tmps[i1]; 
2754019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2755019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2756019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2757019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2758019b515eSShri Abhyankar sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2759019b515eSShri Abhyankar } 2760019b515eSShri Abhyankar if (j == nz - 1) { 2761019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2762019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2763019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0; 2764019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0; 2765019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0; 2766019b515eSShri Abhyankar sum5 -= v5[j + 4] * tmp0; 2767019b515eSShri Abhyankar } 2768019b515eSShri Abhyankar 27699371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 27709371c9d4SSatish Balay row--; 2771019b515eSShri Abhyankar sum2 -= v2[0] * tmp0; 2772019b515eSShri Abhyankar sum3 -= v3[1] * tmp0; 2773019b515eSShri Abhyankar sum4 -= v4[2] * tmp0; 2774019b515eSShri Abhyankar sum5 -= v5[3] * tmp0; 27759371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 27769371c9d4SSatish Balay row--; 2777019b515eSShri Abhyankar sum3 -= v3[0] * tmp0; 2778019b515eSShri Abhyankar sum4 -= v4[1] * tmp0; 2779019b515eSShri Abhyankar sum5 -= v5[2] * tmp0; 27809371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 27819371c9d4SSatish Balay row--; 2782019b515eSShri Abhyankar sum4 -= v4[0] * tmp0; 2783019b515eSShri Abhyankar sum5 -= v5[1] * tmp0; 27849371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 27859371c9d4SSatish Balay row--; 2786019b515eSShri Abhyankar sum5 -= v5[0] * tmp0; 27879371c9d4SSatish Balay x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 27889371c9d4SSatish Balay row--; 2789019b515eSShri Abhyankar break; 2790d71ae5a4SJacob Faibussowitsch default: 2791d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node 
size not yet supported "); 2792019b515eSShri Abhyankar } 2793019b515eSShri Abhyankar } 27949566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 27959566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 27969566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 27979566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(xx, &x)); 27989566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 27993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2800019b515eSShri Abhyankar } 2801019b515eSShri Abhyankar 28024c1414c8SBarry Smith /* 28034c1414c8SBarry Smith MatColoringPatch_SeqAIJ_Inode - Makes a longer coloring[] array (one entry per member of each inode instead of one entry per inode), renumbers the color values that are actually used so they are contiguous starting at 0, and builds an ISColoring from the result. Input: mat - matrix whose inode layout (a->inode.node_count and a->inode.size_csr) is read; ncolors - number of colors in the incoming per-inode coloring; nin - not referenced in this function; coloring[] - one color per inode, released with PetscFree() before returning. Output: iscoloring - created by ISColoringCreate() on the communicator of mat; the expanded array is passed with PETSC_OWN_POINTER, so presumably the ISColoring takes ownership of it (confirm against the ISColoringCreate documentation). Errors: the PetscCheck fails with PETSC_ERR_COR when a->inode.size_csr is NULL; any failing PETSc call is propagated through PetscCall. 28044c1414c8SBarry Smith */ 280566976f2fSJacob Faibussowitsch static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2806d71ae5a4SJacob Faibussowitsch { 28074c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2808*4d12350bSJunchao Zhang PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row; 28094c1414c8SBarry Smith PetscInt *colorused, i; 28104c1414c8SBarry Smith ISColoringValue *newcolor; 28114c1414c8SBarry Smith 28124c1414c8SBarry Smith PetscFunctionBegin; 2813*4d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 28149566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &newcolor)); 28154c1414c8SBarry Smith /* loop over inodes, marking a color for each column: member j of inode i (whose size is ns[i + 1] - ns[i]) receives coloring[i] + j * ncolors, so members of the same inode get different values; row counts the entries of newcolor written so far */ 28164c1414c8SBarry Smith row = 0; 28174c1414c8SBarry Smith for (i = 0; i < m; i++) { 2818*4d12350bSJunchao Zhang for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++)); 28194c1414c8SBarry Smith } 28204c1414c8SBarry Smith 28214c1414c8SBarry Smith /* eliminate unneeded colors: flag every value present in newcolor[0..n-1], turn the flags into a running count so that colorused[c] - 1 is the compact index of color c, and take the final count as the new ncolors. NOTE(review): the 5 * ncolors table size presumably relies on inodes having at most 5 members (the switch statements in this file only handle node sizes 1 to 5) - confirm. NOTE(review): the expansion loop above writes one entry per inode member while the loops that follow read n = mat->cmap->n entries - presumably the two counts agree; confirm against the callers */ 28229566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(5 * 
ncolors, &colorused)); 2823ad540459SPierre Jolivet for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 28244c1414c8SBarry Smith 2825ad540459SPierre Jolivet for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 28264c1414c8SBarry Smith ncolors = colorused[5 * ncolors - 1]; 28276497c311SBarry Smith for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i)); 28289566063dSJacob Faibussowitsch PetscCall(PetscFree(colorused)); 28299566063dSJacob Faibussowitsch PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 28309566063dSJacob Faibussowitsch PetscCall(PetscFree(coloring)); 28313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 28324c1414c8SBarry Smith } 28334c1414c8SBarry Smith 2834af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 28352af78befSBarry Smith 2836d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2837d71ae5a4SJacob Faibussowitsch { 28382af78befSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 28397aaeff0aSMatthew G. Knepley PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 28405850ef23SBarry Smith MatScalar *ibdiag, *bdiag, work[25], *t; 2841a8b09249SBarry Smith PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 28427aaeff0aSMatthew G. Knepley const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 28435850ef23SBarry Smith const PetscScalar *xb, *b; 28447b6c816cSBarry Smith PetscReal zeropivot = 100. 
* PETSC_MACHINE_EPSILON, shift = 0.0; 2845*4d12350bSJunchao Zhang PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2, nodesz; 28468758e1faSBarry Smith PetscInt sz, k, ipvt[5]; 28477b6c816cSBarry Smith PetscBool allowzeropivot, zeropivotdetected; 2848*4d12350bSJunchao Zhang const PetscInt *sizes = a->inode.size_csr, *idx, *diag = a->diag, *ii = a->i; 28492af78befSBarry Smith 28502af78befSBarry Smith PetscFunctionBegin; 2851a455e926SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 2852*4d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 285308401ef6SPierre Jolivet PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 285408401ef6SPierre Jolivet PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 28552af78befSBarry Smith 285671f1c65dSBarry Smith if (!a->inode.ibdiagvalid) { 28572af78befSBarry Smith if (!a->inode.ibdiag) { 28582af78befSBarry Smith /* calculate space needed for diagonal blocks */ 2859*4d12350bSJunchao Zhang for (i = 0; i < m; i++) { 2860*4d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 2861*4d12350bSJunchao Zhang cnt += nodesz * nodesz; 2862*4d12350bSJunchao Zhang } 2863f0d39aaaSBarry Smith a->inode.bdiagsize = cnt; 28642205254eSKarl Rupp 28659566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 286671f1c65dSBarry Smith } 286771f1c65dSBarry Smith 286871f1c65dSBarry Smith /* copy over the diagonal blocks and invert them */ 28692af78befSBarry Smith ibdiag = a->inode.ibdiag; 28702af78befSBarry Smith bdiag = a->inode.bdiag; 28712af78befSBarry Smith cnt = 0; 28722af78befSBarry Smith for (i = 0, row = 0; i < m; i++) { 2873*4d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 2874*4d12350bSJunchao Zhang for (j = 0; j < nodesz; j++) { 2875*4d12350bSJunchao Zhang for (k = 0; k < 
nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k]; 28762af78befSBarry Smith } 2877*4d12350bSJunchao Zhang PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz)); 28782af78befSBarry Smith 2879*4d12350bSJunchao Zhang switch (nodesz) { 28802af78befSBarry Smith case 1: 28812af78befSBarry Smith /* Create matrix data structure */ 28828e0e2a9aSHong Zhang if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 28838e0e2a9aSHong Zhang if (allowzeropivot) { 28847b6c816cSBarry Smith A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 28857b6c816cSBarry Smith A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 28867b6c816cSBarry Smith A->factorerror_zeropivot_row = row; 28879566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 288898921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 28898e0e2a9aSHong Zhang } 289064c62002SMatthew Knepley ibdiag[cnt] = 1.0 / ibdiag[cnt]; 28912af78befSBarry Smith break; 28922af78befSBarry Smith case 2: 28939566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 28947b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 28952af78befSBarry Smith break; 28962af78befSBarry Smith case 3: 28979566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 28987b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 28992af78befSBarry Smith break; 29002af78befSBarry Smith case 4: 29019566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 29027b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 29032af78befSBarry Smith break; 29042af78befSBarry Smith case 5: 29059566063dSJacob 
Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 29067b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 29072af78befSBarry Smith break; 2908d71ae5a4SJacob Faibussowitsch default: 2909*4d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 29102af78befSBarry Smith } 2911*4d12350bSJunchao Zhang cnt += nodesz * nodesz; 2912*4d12350bSJunchao Zhang row += nodesz; 29132af78befSBarry Smith } 291471f1c65dSBarry Smith a->inode.ibdiagvalid = PETSC_TRUE; 29152af78befSBarry Smith } 29162af78befSBarry Smith ibdiag = a->inode.ibdiag; 29172af78befSBarry Smith bdiag = a->inode.bdiag; 29185850ef23SBarry Smith t = a->inode.ssor_work; 29192af78befSBarry Smith 29209566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 29219566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 29225850ef23SBarry Smith /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 29235850ef23SBarry Smith if (flag & SOR_ZERO_INITIAL_GUESS) { 29242af78befSBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 29258862d2efSBarry Smith for (i = 0, row = 0; i < m; i++) { 29268862d2efSBarry Smith sz = diag[row] - ii[row]; 29278862d2efSBarry Smith v1 = a->a + ii[row]; 29288862d2efSBarry Smith idx = a->j + ii[row]; 29298862d2efSBarry Smith 29304108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2931*4d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 2932*4d12350bSJunchao Zhang switch (nodesz) { 29338862d2efSBarry Smith case 1: 29348862d2efSBarry Smith 29358862d2efSBarry Smith sum1 = b[row]; 29368862d2efSBarry Smith for (n = 0; n < sz - 1; n += 2) { 29378862d2efSBarry Smith i1 = idx[0]; 29388862d2efSBarry Smith i2 = idx[1]; 29398862d2efSBarry Smith idx += 2; 
29408862d2efSBarry Smith tmp0 = x[i1]; 29418862d2efSBarry Smith tmp1 = x[i2]; 29429371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 29439371c9d4SSatish Balay v1 += 2; 29448862d2efSBarry Smith } 29458862d2efSBarry Smith 29468862d2efSBarry Smith if (n == sz - 1) { 2947f0d39aaaSBarry Smith tmp0 = x[*idx]; 2948f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 29498862d2efSBarry Smith } 29505850ef23SBarry Smith t[row] = sum1; 29518862d2efSBarry Smith x[row++] = sum1 * (*ibdiag++); 29528862d2efSBarry Smith break; 2953f0d39aaaSBarry Smith case 2: 2954f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 2955f0d39aaaSBarry Smith sum1 = b[row]; 2956f0d39aaaSBarry Smith sum2 = b[row + 1]; 2957f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 2958f0d39aaaSBarry Smith i1 = idx[0]; 2959f0d39aaaSBarry Smith i2 = idx[1]; 2960f0d39aaaSBarry Smith idx += 2; 2961f0d39aaaSBarry Smith tmp0 = x[i1]; 2962f0d39aaaSBarry Smith tmp1 = x[i2]; 29639371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 29649371c9d4SSatish Balay v1 += 2; 29659371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 29669371c9d4SSatish Balay v2 += 2; 2967f0d39aaaSBarry Smith } 2968f0d39aaaSBarry Smith 2969f0d39aaaSBarry Smith if (n == sz - 1) { 2970f0d39aaaSBarry Smith tmp0 = x[*idx]; 2971f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 2972f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 2973f0d39aaaSBarry Smith } 29745850ef23SBarry Smith t[row] = sum1; 29755850ef23SBarry Smith t[row + 1] = sum2; 2976f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2977f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2978f0d39aaaSBarry Smith ibdiag += 4; 2979f0d39aaaSBarry Smith break; 2980f0d39aaaSBarry Smith case 3: 2981f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 2982f0d39aaaSBarry Smith v3 = a->a + ii[row + 2]; 2983f0d39aaaSBarry Smith sum1 = b[row]; 2984f0d39aaaSBarry Smith sum2 = b[row + 1]; 2985f0d39aaaSBarry Smith sum3 = b[row + 2]; 2986f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) 
{ 2987f0d39aaaSBarry Smith i1 = idx[0]; 2988f0d39aaaSBarry Smith i2 = idx[1]; 2989f0d39aaaSBarry Smith idx += 2; 2990f0d39aaaSBarry Smith tmp0 = x[i1]; 2991f0d39aaaSBarry Smith tmp1 = x[i2]; 29929371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 29939371c9d4SSatish Balay v1 += 2; 29949371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 29959371c9d4SSatish Balay v2 += 2; 29969371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 29979371c9d4SSatish Balay v3 += 2; 2998f0d39aaaSBarry Smith } 2999f0d39aaaSBarry Smith 3000f0d39aaaSBarry Smith if (n == sz - 1) { 3001f0d39aaaSBarry Smith tmp0 = x[*idx]; 3002f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 3003f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 3004f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0; 3005f0d39aaaSBarry Smith } 30065850ef23SBarry Smith t[row] = sum1; 30075850ef23SBarry Smith t[row + 1] = sum2; 30085850ef23SBarry Smith t[row + 2] = sum3; 3009f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3010f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3011f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3012f0d39aaaSBarry Smith ibdiag += 9; 3013f0d39aaaSBarry Smith break; 3014f0d39aaaSBarry Smith case 4: 3015f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 3016f0d39aaaSBarry Smith v3 = a->a + ii[row + 2]; 3017f0d39aaaSBarry Smith v4 = a->a + ii[row + 3]; 3018f0d39aaaSBarry Smith sum1 = b[row]; 3019f0d39aaaSBarry Smith sum2 = b[row + 1]; 3020f0d39aaaSBarry Smith sum3 = b[row + 2]; 3021f0d39aaaSBarry Smith sum4 = b[row + 3]; 3022f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3023f0d39aaaSBarry Smith i1 = idx[0]; 3024f0d39aaaSBarry Smith i2 = idx[1]; 3025f0d39aaaSBarry Smith idx += 2; 3026f0d39aaaSBarry Smith tmp0 = x[i1]; 3027f0d39aaaSBarry Smith tmp1 = x[i2]; 30289371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 30299371c9d4SSatish Balay v1 += 2; 30309371c9d4SSatish Balay sum2 -= 
v2[0] * tmp0 + v2[1] * tmp1; 30319371c9d4SSatish Balay v2 += 2; 30329371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 30339371c9d4SSatish Balay v3 += 2; 30349371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 30359371c9d4SSatish Balay v4 += 2; 3036f0d39aaaSBarry Smith } 3037f0d39aaaSBarry Smith 3038f0d39aaaSBarry Smith if (n == sz - 1) { 3039f0d39aaaSBarry Smith tmp0 = x[*idx]; 3040f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 3041f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 3042f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0; 3043f0d39aaaSBarry Smith sum4 -= v4[0] * tmp0; 3044f0d39aaaSBarry Smith } 30455850ef23SBarry Smith t[row] = sum1; 30465850ef23SBarry Smith t[row + 1] = sum2; 30475850ef23SBarry Smith t[row + 2] = sum3; 30485850ef23SBarry Smith t[row + 3] = sum4; 3049f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3050f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3051f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3052f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3053f0d39aaaSBarry Smith ibdiag += 16; 3054f0d39aaaSBarry Smith break; 3055f0d39aaaSBarry Smith case 5: 3056f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 3057f0d39aaaSBarry Smith v3 = a->a + ii[row + 2]; 3058f0d39aaaSBarry Smith v4 = a->a + ii[row + 3]; 3059f0d39aaaSBarry Smith v5 = a->a + ii[row + 4]; 3060f0d39aaaSBarry Smith sum1 = b[row]; 3061f0d39aaaSBarry Smith sum2 = b[row + 1]; 3062f0d39aaaSBarry Smith sum3 = b[row + 2]; 3063f0d39aaaSBarry Smith sum4 = b[row + 3]; 3064f0d39aaaSBarry Smith sum5 = b[row + 4]; 3065f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3066f0d39aaaSBarry Smith i1 = idx[0]; 3067f0d39aaaSBarry Smith i2 = idx[1]; 3068f0d39aaaSBarry Smith idx += 2; 3069f0d39aaaSBarry Smith tmp0 = x[i1]; 3070f0d39aaaSBarry Smith tmp1 = 
x[i2]; 30719371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 30729371c9d4SSatish Balay v1 += 2; 30739371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 30749371c9d4SSatish Balay v2 += 2; 30759371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 30769371c9d4SSatish Balay v3 += 2; 30779371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 30789371c9d4SSatish Balay v4 += 2; 30799371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 30809371c9d4SSatish Balay v5 += 2; 3081f0d39aaaSBarry Smith } 3082f0d39aaaSBarry Smith 3083f0d39aaaSBarry Smith if (n == sz - 1) { 3084f0d39aaaSBarry Smith tmp0 = x[*idx]; 3085f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 3086f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 3087f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0; 3088f0d39aaaSBarry Smith sum4 -= v4[0] * tmp0; 3089f0d39aaaSBarry Smith sum5 -= v5[0] * tmp0; 3090f0d39aaaSBarry Smith } 30915850ef23SBarry Smith t[row] = sum1; 30925850ef23SBarry Smith t[row + 1] = sum2; 30935850ef23SBarry Smith t[row + 2] = sum3; 30945850ef23SBarry Smith t[row + 3] = sum4; 30955850ef23SBarry Smith t[row + 4] = sum5; 3096f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3097f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3098f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3099f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3100f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3101f0d39aaaSBarry Smith ibdiag += 25; 3102f0d39aaaSBarry Smith break; 3103d71ae5a4SJacob Faibussowitsch default: 3104*4d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" 
PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 31058862d2efSBarry Smith } 31062af78befSBarry Smith } 31072af78befSBarry Smith 31085850ef23SBarry Smith xb = t; 31099566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 31102af78befSBarry Smith } else xb = b; 31112af78befSBarry Smith if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3112f0d39aaaSBarry Smith ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3113d0f46423SBarry Smith for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3114*4d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 3115*4d12350bSJunchao Zhang ibdiag -= nodesz * nodesz; 31168862d2efSBarry Smith sz = ii[row + 1] - diag[row] - 1; 31178862d2efSBarry Smith v1 = a->a + diag[row] + 1; 31188862d2efSBarry Smith idx = a->j + diag[row] + 1; 31192af78befSBarry Smith 31204108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3121*4d12350bSJunchao Zhang switch (nodesz) { 31228862d2efSBarry Smith case 1: 31238862d2efSBarry Smith 31248862d2efSBarry Smith sum1 = xb[row]; 31258862d2efSBarry Smith for (n = 0; n < sz - 1; n += 2) { 31268862d2efSBarry Smith i1 = idx[0]; 31278862d2efSBarry Smith i2 = idx[1]; 31288862d2efSBarry Smith idx += 2; 31298862d2efSBarry Smith tmp0 = x[i1]; 31308862d2efSBarry Smith tmp1 = x[i2]; 31319371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 31329371c9d4SSatish Balay v1 += 2; 31338862d2efSBarry Smith } 31348862d2efSBarry Smith 31358862d2efSBarry Smith if (n == sz - 1) { 3136f0d39aaaSBarry Smith tmp0 = x[*idx]; 3137f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 31388862d2efSBarry Smith } 3139f0d39aaaSBarry Smith x[row--] = sum1 * (*ibdiag); 3140f0d39aaaSBarry Smith break; 3141f0d39aaaSBarry Smith 3142f0d39aaaSBarry Smith case 2: 3143f0d39aaaSBarry Smith 3144f0d39aaaSBarry Smith sum1 = xb[row]; 3145f0d39aaaSBarry Smith sum2 = xb[row - 1]; 3146f0d39aaaSBarry Smith /* note that sum1 is associated with the second of the two rows */ 3147f0d39aaaSBarry Smith v2 = a->a + 
diag[row - 1] + 2; 3148f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3149f0d39aaaSBarry Smith i1 = idx[0]; 3150f0d39aaaSBarry Smith i2 = idx[1]; 3151f0d39aaaSBarry Smith idx += 2; 3152f0d39aaaSBarry Smith tmp0 = x[i1]; 3153f0d39aaaSBarry Smith tmp1 = x[i2]; 31549371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 31559371c9d4SSatish Balay v1 += 2; 31569371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 31579371c9d4SSatish Balay v2 += 2; 3158f0d39aaaSBarry Smith } 3159f0d39aaaSBarry Smith 3160f0d39aaaSBarry Smith if (n == sz - 1) { 3161f0d39aaaSBarry Smith tmp0 = x[*idx]; 3162f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 3163f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 3164f0d39aaaSBarry Smith } 3165f0d39aaaSBarry Smith x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3166f0d39aaaSBarry Smith x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3167f0d39aaaSBarry Smith break; 3168f0d39aaaSBarry Smith case 3: 3169f0d39aaaSBarry Smith 3170f0d39aaaSBarry Smith sum1 = xb[row]; 3171f0d39aaaSBarry Smith sum2 = xb[row - 1]; 3172f0d39aaaSBarry Smith sum3 = xb[row - 2]; 3173f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 3174f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3; 3175f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3176f0d39aaaSBarry Smith i1 = idx[0]; 3177f0d39aaaSBarry Smith i2 = idx[1]; 3178f0d39aaaSBarry Smith idx += 2; 3179f0d39aaaSBarry Smith tmp0 = x[i1]; 3180f0d39aaaSBarry Smith tmp1 = x[i2]; 31819371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 31829371c9d4SSatish Balay v1 += 2; 31839371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 31849371c9d4SSatish Balay v2 += 2; 31859371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 31869371c9d4SSatish Balay v3 += 2; 3187f0d39aaaSBarry Smith } 3188f0d39aaaSBarry Smith 3189f0d39aaaSBarry Smith if (n == sz - 1) { 3190f0d39aaaSBarry Smith tmp0 = x[*idx]; 3191f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 3192f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 3193f0d39aaaSBarry Smith sum3 -= 
*v3 * tmp0; 3194f0d39aaaSBarry Smith } 3195f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3196f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3197f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3198f0d39aaaSBarry Smith break; 3199f0d39aaaSBarry Smith case 4: 3200f0d39aaaSBarry Smith 3201f0d39aaaSBarry Smith sum1 = xb[row]; 3202f0d39aaaSBarry Smith sum2 = xb[row - 1]; 3203f0d39aaaSBarry Smith sum3 = xb[row - 2]; 3204f0d39aaaSBarry Smith sum4 = xb[row - 3]; 3205f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 3206f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3; 3207f0d39aaaSBarry Smith v4 = a->a + diag[row - 3] + 4; 3208f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3209f0d39aaaSBarry Smith i1 = idx[0]; 3210f0d39aaaSBarry Smith i2 = idx[1]; 3211f0d39aaaSBarry Smith idx += 2; 3212f0d39aaaSBarry Smith tmp0 = x[i1]; 3213f0d39aaaSBarry Smith tmp1 = x[i2]; 32149371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 32159371c9d4SSatish Balay v1 += 2; 32169371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 32179371c9d4SSatish Balay v2 += 2; 32189371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 32199371c9d4SSatish Balay v3 += 2; 32209371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 32219371c9d4SSatish Balay v4 += 2; 3222f0d39aaaSBarry Smith } 3223f0d39aaaSBarry Smith 3224f0d39aaaSBarry Smith if (n == sz - 1) { 3225f0d39aaaSBarry Smith tmp0 = x[*idx]; 3226f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 3227f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 3228f0d39aaaSBarry Smith sum3 -= *v3 * tmp0; 3229f0d39aaaSBarry Smith sum4 -= *v4 * tmp0; 3230f0d39aaaSBarry Smith } 3231f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3232f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3233f0d39aaaSBarry Smith x[row--] = sum4 
* ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3234f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3235f0d39aaaSBarry Smith break; 3236f0d39aaaSBarry Smith case 5: 3237f0d39aaaSBarry Smith 3238f0d39aaaSBarry Smith sum1 = xb[row]; 3239f0d39aaaSBarry Smith sum2 = xb[row - 1]; 3240f0d39aaaSBarry Smith sum3 = xb[row - 2]; 3241f0d39aaaSBarry Smith sum4 = xb[row - 3]; 3242f0d39aaaSBarry Smith sum5 = xb[row - 4]; 3243f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 3244f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3; 3245f0d39aaaSBarry Smith v4 = a->a + diag[row - 3] + 4; 3246f0d39aaaSBarry Smith v5 = a->a + diag[row - 4] + 5; 3247f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3248f0d39aaaSBarry Smith i1 = idx[0]; 3249f0d39aaaSBarry Smith i2 = idx[1]; 3250f0d39aaaSBarry Smith idx += 2; 3251f0d39aaaSBarry Smith tmp0 = x[i1]; 3252f0d39aaaSBarry Smith tmp1 = x[i2]; 32539371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 32549371c9d4SSatish Balay v1 += 2; 32559371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 32569371c9d4SSatish Balay v2 += 2; 32579371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 32589371c9d4SSatish Balay v3 += 2; 32599371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 32609371c9d4SSatish Balay v4 += 2; 32619371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 32629371c9d4SSatish Balay v5 += 2; 3263f0d39aaaSBarry Smith } 3264f0d39aaaSBarry Smith 3265f0d39aaaSBarry Smith if (n == sz - 1) { 3266f0d39aaaSBarry Smith tmp0 = x[*idx]; 3267f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 3268f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 3269f0d39aaaSBarry Smith sum3 -= *v3 * tmp0; 3270f0d39aaaSBarry Smith sum4 -= *v4 * tmp0; 3271f0d39aaaSBarry Smith sum5 -= *v5 * tmp0; 3272f0d39aaaSBarry Smith } 3273f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3274f0d39aaaSBarry 
Smith x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3275f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3276f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3277f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 32788862d2efSBarry Smith break; 3279d71ae5a4SJacob Faibussowitsch default: 3280*4d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 32818862d2efSBarry Smith } 32822af78befSBarry Smith } 32832af78befSBarry Smith 32849566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 32852af78befSBarry Smith } 32862af78befSBarry Smith its--; 32875850ef23SBarry Smith } 32885850ef23SBarry Smith while (its--) { 32895850ef23SBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3290*4d12350bSJunchao Zhang for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) { 3291*4d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 3292d876e2b0SMark Adams sz = diag[row] - ii[row]; 32935850ef23SBarry Smith v1 = a->a + ii[row]; 32945850ef23SBarry Smith idx = a->j + ii[row]; 32955850ef23SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3296*4d12350bSJunchao Zhang switch (nodesz) { 32975850ef23SBarry Smith case 1: 32985850ef23SBarry Smith sum1 = b[row]; 32995850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 33005850ef23SBarry Smith i1 = idx[0]; 33015850ef23SBarry Smith i2 = idx[1]; 33025850ef23SBarry Smith idx += 2; 33035850ef23SBarry Smith tmp0 = x[i1]; 33045850ef23SBarry Smith tmp1 = x[i2]; 33059371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 33069371c9d4SSatish Balay v1 += 
2; 33075850ef23SBarry Smith } 33085850ef23SBarry Smith if (n == sz - 1) { 3309d876e2b0SMark Adams tmp0 = x[*idx++]; 3310d876e2b0SMark Adams sum1 -= *v1 * tmp0; 3311d876e2b0SMark Adams v1++; 3312d876e2b0SMark Adams } 3313d876e2b0SMark Adams t[row] = sum1; 3314d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 1; 3315d876e2b0SMark Adams idx = a->j + diag[row] + 1; 3316d876e2b0SMark Adams v1 += 1; 3317d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3318d876e2b0SMark Adams i1 = idx[0]; 3319d876e2b0SMark Adams i2 = idx[1]; 3320d876e2b0SMark Adams idx += 2; 3321d876e2b0SMark Adams tmp0 = x[i1]; 3322d876e2b0SMark Adams tmp1 = x[i2]; 33239371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 33249371c9d4SSatish Balay v1 += 2; 3325d876e2b0SMark Adams } 3326d876e2b0SMark Adams if (n == sz - 1) { 3327d876e2b0SMark Adams tmp0 = x[*idx++]; 33285850ef23SBarry Smith sum1 -= *v1 * tmp0; 33295850ef23SBarry Smith } 33305850ef23SBarry Smith /* in MatSOR_SeqAIJ this line would be 33315850ef23SBarry Smith * 33325850ef23SBarry Smith * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 33335850ef23SBarry Smith * 33345850ef23SBarry Smith * but omega == 1, so this becomes 33355850ef23SBarry Smith * 3336d876e2b0SMark Adams * x[row] = sum1*(*ibdiag++); 33375850ef23SBarry Smith * 33385850ef23SBarry Smith */ 3339d876e2b0SMark Adams x[row] = sum1 * (*ibdiag); 33405850ef23SBarry Smith break; 33415850ef23SBarry Smith case 2: 33425850ef23SBarry Smith v2 = a->a + ii[row + 1]; 33435850ef23SBarry Smith sum1 = b[row]; 33445850ef23SBarry Smith sum2 = b[row + 1]; 33455850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 33465850ef23SBarry Smith i1 = idx[0]; 33475850ef23SBarry Smith i2 = idx[1]; 33485850ef23SBarry Smith idx += 2; 33495850ef23SBarry Smith tmp0 = x[i1]; 33505850ef23SBarry Smith tmp1 = x[i2]; 33519371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 33529371c9d4SSatish Balay v1 += 2; 33539371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 
33549371c9d4SSatish Balay v2 += 2; 33555850ef23SBarry Smith } 3356d876e2b0SMark Adams if (n == sz - 1) { 3357d876e2b0SMark Adams tmp0 = x[*idx++]; 3358d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 3359d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 33609371c9d4SSatish Balay v1++; 33619371c9d4SSatish Balay v2++; 3362d876e2b0SMark Adams } 3363d876e2b0SMark Adams t[row] = sum1; 3364d876e2b0SMark Adams t[row + 1] = sum2; 3365d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 2; 3366d876e2b0SMark Adams idx = a->j + diag[row] + 2; 3367d876e2b0SMark Adams v1 += 2; 3368d876e2b0SMark Adams v2 += 2; 3369d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3370d876e2b0SMark Adams i1 = idx[0]; 3371d876e2b0SMark Adams i2 = idx[1]; 3372d876e2b0SMark Adams idx += 2; 3373d876e2b0SMark Adams tmp0 = x[i1]; 3374d876e2b0SMark Adams tmp1 = x[i2]; 33759371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 33769371c9d4SSatish Balay v1 += 2; 33779371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 33789371c9d4SSatish Balay v2 += 2; 3379d876e2b0SMark Adams } 33805850ef23SBarry Smith if (n == sz - 1) { 33815850ef23SBarry Smith tmp0 = x[*idx]; 33825850ef23SBarry Smith sum1 -= v1[0] * tmp0; 33835850ef23SBarry Smith sum2 -= v2[0] * tmp0; 33845850ef23SBarry Smith } 3385d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3386d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 33875850ef23SBarry Smith break; 33885850ef23SBarry Smith case 3: 33895850ef23SBarry Smith v2 = a->a + ii[row + 1]; 33905850ef23SBarry Smith v3 = a->a + ii[row + 2]; 33915850ef23SBarry Smith sum1 = b[row]; 33925850ef23SBarry Smith sum2 = b[row + 1]; 33935850ef23SBarry Smith sum3 = b[row + 2]; 33945850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 33955850ef23SBarry Smith i1 = idx[0]; 33965850ef23SBarry Smith i2 = idx[1]; 33975850ef23SBarry Smith idx += 2; 33985850ef23SBarry Smith tmp0 = x[i1]; 33995850ef23SBarry Smith tmp1 = x[i2]; 34009371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * 
tmp1; 34019371c9d4SSatish Balay v1 += 2; 34029371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 34039371c9d4SSatish Balay v2 += 2; 34049371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 34059371c9d4SSatish Balay v3 += 2; 34065850ef23SBarry Smith } 3407d876e2b0SMark Adams if (n == sz - 1) { 3408d876e2b0SMark Adams tmp0 = x[*idx++]; 3409d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 3410d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 3411d876e2b0SMark Adams sum3 -= v3[0] * tmp0; 34129371c9d4SSatish Balay v1++; 34139371c9d4SSatish Balay v2++; 34149371c9d4SSatish Balay v3++; 3415d876e2b0SMark Adams } 3416d876e2b0SMark Adams t[row] = sum1; 3417d876e2b0SMark Adams t[row + 1] = sum2; 3418d876e2b0SMark Adams t[row + 2] = sum3; 3419d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 3; 3420d876e2b0SMark Adams idx = a->j + diag[row] + 3; 3421d876e2b0SMark Adams v1 += 3; 3422d876e2b0SMark Adams v2 += 3; 3423d876e2b0SMark Adams v3 += 3; 3424d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3425d876e2b0SMark Adams i1 = idx[0]; 3426d876e2b0SMark Adams i2 = idx[1]; 3427d876e2b0SMark Adams idx += 2; 3428d876e2b0SMark Adams tmp0 = x[i1]; 3429d876e2b0SMark Adams tmp1 = x[i2]; 34309371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 34319371c9d4SSatish Balay v1 += 2; 34329371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 34339371c9d4SSatish Balay v2 += 2; 34349371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 34359371c9d4SSatish Balay v3 += 2; 3436d876e2b0SMark Adams } 34375850ef23SBarry Smith if (n == sz - 1) { 34385850ef23SBarry Smith tmp0 = x[*idx]; 34395850ef23SBarry Smith sum1 -= v1[0] * tmp0; 34405850ef23SBarry Smith sum2 -= v2[0] * tmp0; 34415850ef23SBarry Smith sum3 -= v3[0] * tmp0; 34425850ef23SBarry Smith } 3443d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3444d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3445d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + 
sum2 * ibdiag[5] + sum3 * ibdiag[8]; 34465850ef23SBarry Smith break; 34475850ef23SBarry Smith case 4: 34485850ef23SBarry Smith v2 = a->a + ii[row + 1]; 34495850ef23SBarry Smith v3 = a->a + ii[row + 2]; 34505850ef23SBarry Smith v4 = a->a + ii[row + 3]; 34515850ef23SBarry Smith sum1 = b[row]; 34525850ef23SBarry Smith sum2 = b[row + 1]; 34535850ef23SBarry Smith sum3 = b[row + 2]; 34545850ef23SBarry Smith sum4 = b[row + 3]; 34555850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 34565850ef23SBarry Smith i1 = idx[0]; 34575850ef23SBarry Smith i2 = idx[1]; 34585850ef23SBarry Smith idx += 2; 34595850ef23SBarry Smith tmp0 = x[i1]; 34605850ef23SBarry Smith tmp1 = x[i2]; 34619371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 34629371c9d4SSatish Balay v1 += 2; 34639371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 34649371c9d4SSatish Balay v2 += 2; 34659371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 34669371c9d4SSatish Balay v3 += 2; 34679371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 34689371c9d4SSatish Balay v4 += 2; 34695850ef23SBarry Smith } 3470d876e2b0SMark Adams if (n == sz - 1) { 3471d876e2b0SMark Adams tmp0 = x[*idx++]; 3472d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 3473d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 3474d876e2b0SMark Adams sum3 -= v3[0] * tmp0; 3475d876e2b0SMark Adams sum4 -= v4[0] * tmp0; 34769371c9d4SSatish Balay v1++; 34779371c9d4SSatish Balay v2++; 34789371c9d4SSatish Balay v3++; 34799371c9d4SSatish Balay v4++; 3480d876e2b0SMark Adams } 3481d876e2b0SMark Adams t[row] = sum1; 3482d876e2b0SMark Adams t[row + 1] = sum2; 3483d876e2b0SMark Adams t[row + 2] = sum3; 3484d876e2b0SMark Adams t[row + 3] = sum4; 3485d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 4; 3486d876e2b0SMark Adams idx = a->j + diag[row] + 4; 3487d876e2b0SMark Adams v1 += 4; 3488d876e2b0SMark Adams v2 += 4; 3489d876e2b0SMark Adams v3 += 4; 3490d876e2b0SMark Adams v4 += 4; 3491d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3492d876e2b0SMark 
Adams i1 = idx[0]; 3493d876e2b0SMark Adams i2 = idx[1]; 3494d876e2b0SMark Adams idx += 2; 3495d876e2b0SMark Adams tmp0 = x[i1]; 3496d876e2b0SMark Adams tmp1 = x[i2]; 34979371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 34989371c9d4SSatish Balay v1 += 2; 34999371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 35009371c9d4SSatish Balay v2 += 2; 35019371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 35029371c9d4SSatish Balay v3 += 2; 35039371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 35049371c9d4SSatish Balay v4 += 2; 3505d876e2b0SMark Adams } 35065850ef23SBarry Smith if (n == sz - 1) { 35075850ef23SBarry Smith tmp0 = x[*idx]; 35085850ef23SBarry Smith sum1 -= v1[0] * tmp0; 35095850ef23SBarry Smith sum2 -= v2[0] * tmp0; 35105850ef23SBarry Smith sum3 -= v3[0] * tmp0; 35115850ef23SBarry Smith sum4 -= v4[0] * tmp0; 35125850ef23SBarry Smith } 3513d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3514d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3515d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3516d876e2b0SMark Adams x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 35175850ef23SBarry Smith break; 35185850ef23SBarry Smith case 5: 35195850ef23SBarry Smith v2 = a->a + ii[row + 1]; 35205850ef23SBarry Smith v3 = a->a + ii[row + 2]; 35215850ef23SBarry Smith v4 = a->a + ii[row + 3]; 35225850ef23SBarry Smith v5 = a->a + ii[row + 4]; 35235850ef23SBarry Smith sum1 = b[row]; 35245850ef23SBarry Smith sum2 = b[row + 1]; 35255850ef23SBarry Smith sum3 = b[row + 2]; 35265850ef23SBarry Smith sum4 = b[row + 3]; 35275850ef23SBarry Smith sum5 = b[row + 4]; 35285850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 35295850ef23SBarry Smith i1 = idx[0]; 35305850ef23SBarry Smith i2 = idx[1]; 35315850ef23SBarry Smith idx += 2; 
35325850ef23SBarry Smith tmp0 = x[i1]; 35335850ef23SBarry Smith tmp1 = x[i2]; 35349371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 35359371c9d4SSatish Balay v1 += 2; 35369371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 35379371c9d4SSatish Balay v2 += 2; 35389371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 35399371c9d4SSatish Balay v3 += 2; 35409371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 35419371c9d4SSatish Balay v4 += 2; 35429371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 35439371c9d4SSatish Balay v5 += 2; 35445850ef23SBarry Smith } 35455850ef23SBarry Smith if (n == sz - 1) { 3546d876e2b0SMark Adams tmp0 = x[*idx++]; 35475850ef23SBarry Smith sum1 -= v1[0] * tmp0; 35485850ef23SBarry Smith sum2 -= v2[0] * tmp0; 35495850ef23SBarry Smith sum3 -= v3[0] * tmp0; 35505850ef23SBarry Smith sum4 -= v4[0] * tmp0; 35515850ef23SBarry Smith sum5 -= v5[0] * tmp0; 35529371c9d4SSatish Balay v1++; 35539371c9d4SSatish Balay v2++; 35549371c9d4SSatish Balay v3++; 35559371c9d4SSatish Balay v4++; 35569371c9d4SSatish Balay v5++; 35575850ef23SBarry Smith } 3558d876e2b0SMark Adams t[row] = sum1; 3559d876e2b0SMark Adams t[row + 1] = sum2; 3560d876e2b0SMark Adams t[row + 2] = sum3; 3561d876e2b0SMark Adams t[row + 3] = sum4; 3562d876e2b0SMark Adams t[row + 4] = sum5; 3563d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 5; 3564d876e2b0SMark Adams idx = a->j + diag[row] + 5; 3565d876e2b0SMark Adams v1 += 5; 3566d876e2b0SMark Adams v2 += 5; 3567d876e2b0SMark Adams v3 += 5; 3568d876e2b0SMark Adams v4 += 5; 3569d876e2b0SMark Adams v5 += 5; 35705850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 35715850ef23SBarry Smith i1 = idx[0]; 35725850ef23SBarry Smith i2 = idx[1]; 35735850ef23SBarry Smith idx += 2; 35745850ef23SBarry Smith tmp0 = x[i1]; 35755850ef23SBarry Smith tmp1 = x[i2]; 35769371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 35779371c9d4SSatish Balay v1 += 2; 35789371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 
35799371c9d4SSatish Balay v2 += 2; 35809371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 35819371c9d4SSatish Balay v3 += 2; 35829371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 35839371c9d4SSatish Balay v4 += 2; 35849371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 35859371c9d4SSatish Balay v5 += 2; 35865850ef23SBarry Smith } 35875850ef23SBarry Smith if (n == sz - 1) { 35885850ef23SBarry Smith tmp0 = x[*idx]; 3589d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 3590d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 3591d876e2b0SMark Adams sum3 -= v3[0] * tmp0; 3592d876e2b0SMark Adams sum4 -= v4[0] * tmp0; 3593d876e2b0SMark Adams sum5 -= v5[0] * tmp0; 35945850ef23SBarry Smith } 3595d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3596d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3597d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3598d876e2b0SMark Adams x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3599d876e2b0SMark Adams x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3600d876e2b0SMark Adams break; 3601d71ae5a4SJacob Faibussowitsch default: 3602*4d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3603d876e2b0SMark Adams } 3604d876e2b0SMark Adams } 3605d876e2b0SMark Adams xb = t; 36069566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3607d876e2b0SMark Adams } else xb = b; 3608d876e2b0SMark Adams 3609d876e2b0SMark Adams if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3610d876e2b0SMark Adams ibdiag = a->inode.ibdiag + 
a->inode.bdiagsize; 3611d876e2b0SMark Adams for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3612*4d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 3613*4d12350bSJunchao Zhang ibdiag -= nodesz * nodesz; 3614d876e2b0SMark Adams 3615d876e2b0SMark Adams /* set RHS */ 3616d876e2b0SMark Adams if (xb == b) { 3617d876e2b0SMark Adams /* whole (old way) */ 3618d876e2b0SMark Adams sz = ii[row + 1] - ii[row]; 3619d876e2b0SMark Adams idx = a->j + ii[row]; 3620*4d12350bSJunchao Zhang switch (nodesz) { 3621d71ae5a4SJacob Faibussowitsch case 5: 3622d71ae5a4SJacob Faibussowitsch v5 = a->a + ii[row - 4]; /* fall through */ 3623d71ae5a4SJacob Faibussowitsch case 4: 3624d71ae5a4SJacob Faibussowitsch v4 = a->a + ii[row - 3]; /* fall through */ 3625d71ae5a4SJacob Faibussowitsch case 3: 3626d71ae5a4SJacob Faibussowitsch v3 = a->a + ii[row - 2]; /* fall through */ 3627d71ae5a4SJacob Faibussowitsch case 2: 3628d71ae5a4SJacob Faibussowitsch v2 = a->a + ii[row - 1]; /* fall through */ 3629d71ae5a4SJacob Faibussowitsch case 1: 3630d71ae5a4SJacob Faibussowitsch v1 = a->a + ii[row]; 3631d71ae5a4SJacob Faibussowitsch break; 3632d71ae5a4SJacob Faibussowitsch default: 3633*4d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3634d876e2b0SMark Adams } 3635d876e2b0SMark Adams } else { 3636d876e2b0SMark Adams /* upper, no diag */ 3637d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 1; 3638d876e2b0SMark Adams idx = a->j + diag[row] + 1; 3639*4d12350bSJunchao Zhang switch (nodesz) { 3640d71ae5a4SJacob Faibussowitsch case 5: 3641d71ae5a4SJacob Faibussowitsch v5 = a->a + diag[row - 4] + 5; /* fall through */ 3642d71ae5a4SJacob Faibussowitsch case 4: 3643d71ae5a4SJacob Faibussowitsch v4 = a->a + diag[row - 3] + 4; /* fall through */ 3644d71ae5a4SJacob Faibussowitsch case 3: 3645d71ae5a4SJacob Faibussowitsch v3 = a->a + diag[row - 2] + 3; /* fall through */ 3646d71ae5a4SJacob Faibussowitsch case 
2: 3647d71ae5a4SJacob Faibussowitsch v2 = a->a + diag[row - 1] + 2; /* fall through */ 3648d71ae5a4SJacob Faibussowitsch case 1: 3649d71ae5a4SJacob Faibussowitsch v1 = a->a + diag[row] + 1; 3650d876e2b0SMark Adams } 3651d876e2b0SMark Adams } 3652d876e2b0SMark Adams /* set sum */ 3653*4d12350bSJunchao Zhang switch (nodesz) { 3654d71ae5a4SJacob Faibussowitsch case 5: 3655d71ae5a4SJacob Faibussowitsch sum5 = xb[row - 4]; /* fall through */ 3656d71ae5a4SJacob Faibussowitsch case 4: 3657d71ae5a4SJacob Faibussowitsch sum4 = xb[row - 3]; /* fall through */ 3658d71ae5a4SJacob Faibussowitsch case 3: 3659d71ae5a4SJacob Faibussowitsch sum3 = xb[row - 2]; /* fall through */ 3660d71ae5a4SJacob Faibussowitsch case 2: 3661d71ae5a4SJacob Faibussowitsch sum2 = xb[row - 1]; /* fall through */ 3662d876e2b0SMark Adams case 1: 3663d876e2b0SMark Adams /* note that sum1 is associated with the last row */ 3664d876e2b0SMark Adams sum1 = xb[row]; 3665d876e2b0SMark Adams } 3666d876e2b0SMark Adams /* do sums */ 3667d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3668d876e2b0SMark Adams i1 = idx[0]; 3669d876e2b0SMark Adams i2 = idx[1]; 3670d876e2b0SMark Adams idx += 2; 3671d876e2b0SMark Adams tmp0 = x[i1]; 3672d876e2b0SMark Adams tmp1 = x[i2]; 3673*4d12350bSJunchao Zhang switch (nodesz) { 3674d71ae5a4SJacob Faibussowitsch case 5: 3675d71ae5a4SJacob Faibussowitsch sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3676d71ae5a4SJacob Faibussowitsch v5 += 2; /* fall through */ 3677d71ae5a4SJacob Faibussowitsch case 4: 3678d71ae5a4SJacob Faibussowitsch sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3679d71ae5a4SJacob Faibussowitsch v4 += 2; /* fall through */ 3680d71ae5a4SJacob Faibussowitsch case 3: 3681d71ae5a4SJacob Faibussowitsch sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3682d71ae5a4SJacob Faibussowitsch v3 += 2; /* fall through */ 3683d71ae5a4SJacob Faibussowitsch case 2: 3684d71ae5a4SJacob Faibussowitsch sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3685d71ae5a4SJacob Faibussowitsch v2 += 2; /* fall through */ 
3686d71ae5a4SJacob Faibussowitsch case 1: 3687d71ae5a4SJacob Faibussowitsch sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3688d71ae5a4SJacob Faibussowitsch v1 += 2; 3689d876e2b0SMark Adams } 3690d876e2b0SMark Adams } 3691d876e2b0SMark Adams /* ragged edge */ 3692d876e2b0SMark Adams if (n == sz - 1) { 3693d876e2b0SMark Adams tmp0 = x[*idx]; 3694*4d12350bSJunchao Zhang switch (nodesz) { 3695d71ae5a4SJacob Faibussowitsch case 5: 3696d71ae5a4SJacob Faibussowitsch sum5 -= *v5 * tmp0; /* fall through */ 3697d71ae5a4SJacob Faibussowitsch case 4: 3698d71ae5a4SJacob Faibussowitsch sum4 -= *v4 * tmp0; /* fall through */ 3699d71ae5a4SJacob Faibussowitsch case 3: 3700d71ae5a4SJacob Faibussowitsch sum3 -= *v3 * tmp0; /* fall through */ 3701d71ae5a4SJacob Faibussowitsch case 2: 3702d71ae5a4SJacob Faibussowitsch sum2 -= *v2 * tmp0; /* fall through */ 3703d71ae5a4SJacob Faibussowitsch case 1: 3704d71ae5a4SJacob Faibussowitsch sum1 -= *v1 * tmp0; 3705d876e2b0SMark Adams } 3706d876e2b0SMark Adams } 3707d876e2b0SMark Adams /* update */ 3708d876e2b0SMark Adams if (xb == b) { 3709d876e2b0SMark Adams /* whole (old way) w/ diag */ 3710*4d12350bSJunchao Zhang switch (nodesz) { 3711d876e2b0SMark Adams case 5: 37125850ef23SBarry Smith x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 37135850ef23SBarry Smith x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 37145850ef23SBarry Smith x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 37155850ef23SBarry Smith x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 37165850ef23SBarry Smith x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 37175850ef23SBarry Smith break; 3718d876e2b0SMark Adams case 4: 3719d876e2b0SMark Adams x[row--] += sum4 * ibdiag[3] + sum3 * 
ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3720d876e2b0SMark Adams x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3721d876e2b0SMark Adams x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3722d876e2b0SMark Adams x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3723d876e2b0SMark Adams break; 3724d876e2b0SMark Adams case 3: 3725d876e2b0SMark Adams x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3726d876e2b0SMark Adams x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3727d876e2b0SMark Adams x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3728d876e2b0SMark Adams break; 3729d876e2b0SMark Adams case 2: 3730d876e2b0SMark Adams x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3731d876e2b0SMark Adams x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3732d876e2b0SMark Adams break; 3733d71ae5a4SJacob Faibussowitsch case 1: 3734d71ae5a4SJacob Faibussowitsch x[row--] += sum1 * (*ibdiag); 3735d71ae5a4SJacob Faibussowitsch break; 3736d876e2b0SMark Adams } 3737d876e2b0SMark Adams } else { 3738d876e2b0SMark Adams /* no diag so set = */ 3739*4d12350bSJunchao Zhang switch (nodesz) { 3740d876e2b0SMark Adams case 5: 3741d876e2b0SMark Adams x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3742d876e2b0SMark Adams x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3743d876e2b0SMark Adams x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3744d876e2b0SMark Adams x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3745d876e2b0SMark Adams x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3746d876e2b0SMark Adams break; 
3747d876e2b0SMark Adams case 4: 3748d876e2b0SMark Adams x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3749d876e2b0SMark Adams x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3750d876e2b0SMark Adams x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3751d876e2b0SMark Adams x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3752d876e2b0SMark Adams break; 3753d876e2b0SMark Adams case 3: 3754d876e2b0SMark Adams x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3755d876e2b0SMark Adams x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3756d876e2b0SMark Adams x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3757d876e2b0SMark Adams break; 3758d876e2b0SMark Adams case 2: 3759d876e2b0SMark Adams x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3760d876e2b0SMark Adams x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3761d876e2b0SMark Adams break; 3762d71ae5a4SJacob Faibussowitsch case 1: 3763d71ae5a4SJacob Faibussowitsch x[row--] = sum1 * (*ibdiag); 3764d71ae5a4SJacob Faibussowitsch break; 37655850ef23SBarry Smith } 37665850ef23SBarry Smith } 3767d876e2b0SMark Adams } 3768d876e2b0SMark Adams if (xb == b) { 37699566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 3770d876e2b0SMark Adams } else { 37719566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3772d876e2b0SMark Adams } 37735850ef23SBarry Smith } 37742af78befSBarry Smith } 377589c6957cSBarry Smith if (flag & SOR_EISENSTAT) { 377689c6957cSBarry Smith /* 377789c6957cSBarry Smith Apply (U + D)^-1 where D is now the block diagonal 377889c6957cSBarry Smith */ 377989c6957cSBarry Smith ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 378089c6957cSBarry Smith for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3781*4d12350bSJunchao Zhang nodesz = 
sizes[i + 1] - sizes[i]; 3782*4d12350bSJunchao Zhang ibdiag -= nodesz * nodesz; 378389c6957cSBarry Smith sz = ii[row + 1] - diag[row] - 1; 378489c6957cSBarry Smith v1 = a->a + diag[row] + 1; 378589c6957cSBarry Smith idx = a->j + diag[row] + 1; 37864108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3787*4d12350bSJunchao Zhang switch (nodesz) { 378889c6957cSBarry Smith case 1: 378989c6957cSBarry Smith 379089c6957cSBarry Smith sum1 = b[row]; 379189c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 379289c6957cSBarry Smith i1 = idx[0]; 379389c6957cSBarry Smith i2 = idx[1]; 379489c6957cSBarry Smith idx += 2; 379589c6957cSBarry Smith tmp0 = x[i1]; 379689c6957cSBarry Smith tmp1 = x[i2]; 37979371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 37989371c9d4SSatish Balay v1 += 2; 379989c6957cSBarry Smith } 380089c6957cSBarry Smith 380189c6957cSBarry Smith if (n == sz - 1) { 380289c6957cSBarry Smith tmp0 = x[*idx]; 380389c6957cSBarry Smith sum1 -= *v1 * tmp0; 380489c6957cSBarry Smith } 38059371c9d4SSatish Balay x[row] = sum1 * (*ibdiag); 38069371c9d4SSatish Balay row--; 380789c6957cSBarry Smith break; 380889c6957cSBarry Smith 380989c6957cSBarry Smith case 2: 381089c6957cSBarry Smith 381189c6957cSBarry Smith sum1 = b[row]; 381289c6957cSBarry Smith sum2 = b[row - 1]; 381389c6957cSBarry Smith /* note that sum1 is associated with the second of the two rows */ 381489c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 381589c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 381689c6957cSBarry Smith i1 = idx[0]; 381789c6957cSBarry Smith i2 = idx[1]; 381889c6957cSBarry Smith idx += 2; 381989c6957cSBarry Smith tmp0 = x[i1]; 382089c6957cSBarry Smith tmp1 = x[i2]; 38219371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 38229371c9d4SSatish Balay v1 += 2; 38239371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 38249371c9d4SSatish Balay v2 += 2; 382589c6957cSBarry Smith } 382689c6957cSBarry Smith 382789c6957cSBarry Smith if (n == 
sz - 1) { 382889c6957cSBarry Smith tmp0 = x[*idx]; 382989c6957cSBarry Smith sum1 -= *v1 * tmp0; 383089c6957cSBarry Smith sum2 -= *v2 * tmp0; 383189c6957cSBarry Smith } 3832938d4eb3SBarry Smith x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3833938d4eb3SBarry Smith x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3834938d4eb3SBarry Smith row -= 2; 383589c6957cSBarry Smith break; 383689c6957cSBarry Smith case 3: 383789c6957cSBarry Smith 383889c6957cSBarry Smith sum1 = b[row]; 383989c6957cSBarry Smith sum2 = b[row - 1]; 384089c6957cSBarry Smith sum3 = b[row - 2]; 384189c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 384289c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3; 384389c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 384489c6957cSBarry Smith i1 = idx[0]; 384589c6957cSBarry Smith i2 = idx[1]; 384689c6957cSBarry Smith idx += 2; 384789c6957cSBarry Smith tmp0 = x[i1]; 384889c6957cSBarry Smith tmp1 = x[i2]; 38499371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 38509371c9d4SSatish Balay v1 += 2; 38519371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 38529371c9d4SSatish Balay v2 += 2; 38539371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 38549371c9d4SSatish Balay v3 += 2; 385589c6957cSBarry Smith } 385689c6957cSBarry Smith 385789c6957cSBarry Smith if (n == sz - 1) { 385889c6957cSBarry Smith tmp0 = x[*idx]; 385989c6957cSBarry Smith sum1 -= *v1 * tmp0; 386089c6957cSBarry Smith sum2 -= *v2 * tmp0; 386189c6957cSBarry Smith sum3 -= *v3 * tmp0; 386289c6957cSBarry Smith } 3863938d4eb3SBarry Smith x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3864938d4eb3SBarry Smith x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3865938d4eb3SBarry Smith x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3866938d4eb3SBarry Smith row -= 3; 386789c6957cSBarry Smith break; 386889c6957cSBarry Smith case 4: 386989c6957cSBarry Smith 387089c6957cSBarry Smith sum1 = b[row]; 387189c6957cSBarry Smith sum2 = 
b[row - 1]; 387289c6957cSBarry Smith sum3 = b[row - 2]; 387389c6957cSBarry Smith sum4 = b[row - 3]; 387489c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 387589c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3; 387689c6957cSBarry Smith v4 = a->a + diag[row - 3] + 4; 387789c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 387889c6957cSBarry Smith i1 = idx[0]; 387989c6957cSBarry Smith i2 = idx[1]; 388089c6957cSBarry Smith idx += 2; 388189c6957cSBarry Smith tmp0 = x[i1]; 388289c6957cSBarry Smith tmp1 = x[i2]; 38839371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 38849371c9d4SSatish Balay v1 += 2; 38859371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 38869371c9d4SSatish Balay v2 += 2; 38879371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 38889371c9d4SSatish Balay v3 += 2; 38899371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 38909371c9d4SSatish Balay v4 += 2; 389189c6957cSBarry Smith } 389289c6957cSBarry Smith 389389c6957cSBarry Smith if (n == sz - 1) { 389489c6957cSBarry Smith tmp0 = x[*idx]; 389589c6957cSBarry Smith sum1 -= *v1 * tmp0; 389689c6957cSBarry Smith sum2 -= *v2 * tmp0; 389789c6957cSBarry Smith sum3 -= *v3 * tmp0; 389889c6957cSBarry Smith sum4 -= *v4 * tmp0; 389989c6957cSBarry Smith } 3900938d4eb3SBarry Smith x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3901938d4eb3SBarry Smith x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3902938d4eb3SBarry Smith x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3903938d4eb3SBarry Smith x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3904938d4eb3SBarry Smith row -= 4; 390589c6957cSBarry Smith break; 390689c6957cSBarry Smith case 5: 390789c6957cSBarry Smith 390889c6957cSBarry Smith sum1 = b[row]; 390989c6957cSBarry Smith sum2 = b[row - 1]; 391089c6957cSBarry Smith sum3 = b[row - 2]; 391189c6957cSBarry Smith sum4 = b[row - 
3]; 391289c6957cSBarry Smith sum5 = b[row - 4]; 391389c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 391489c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3; 391589c6957cSBarry Smith v4 = a->a + diag[row - 3] + 4; 391689c6957cSBarry Smith v5 = a->a + diag[row - 4] + 5; 391789c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 391889c6957cSBarry Smith i1 = idx[0]; 391989c6957cSBarry Smith i2 = idx[1]; 392089c6957cSBarry Smith idx += 2; 392189c6957cSBarry Smith tmp0 = x[i1]; 392289c6957cSBarry Smith tmp1 = x[i2]; 39239371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 39249371c9d4SSatish Balay v1 += 2; 39259371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 39269371c9d4SSatish Balay v2 += 2; 39279371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 39289371c9d4SSatish Balay v3 += 2; 39299371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 39309371c9d4SSatish Balay v4 += 2; 39319371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 39329371c9d4SSatish Balay v5 += 2; 393389c6957cSBarry Smith } 393489c6957cSBarry Smith 393589c6957cSBarry Smith if (n == sz - 1) { 393689c6957cSBarry Smith tmp0 = x[*idx]; 393789c6957cSBarry Smith sum1 -= *v1 * tmp0; 393889c6957cSBarry Smith sum2 -= *v2 * tmp0; 393989c6957cSBarry Smith sum3 -= *v3 * tmp0; 394089c6957cSBarry Smith sum4 -= *v4 * tmp0; 394189c6957cSBarry Smith sum5 -= *v5 * tmp0; 394289c6957cSBarry Smith } 3943938d4eb3SBarry Smith x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3944938d4eb3SBarry Smith x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3945938d4eb3SBarry Smith x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3946938d4eb3SBarry Smith x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3947938d4eb3SBarry Smith x[row - 4] = sum5 * ibdiag[0] + sum4 * 
ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3948938d4eb3SBarry Smith row -= 5; 394989c6957cSBarry Smith break; 3950d71ae5a4SJacob Faibussowitsch default: 3951*4d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 395289c6957cSBarry Smith } 395389c6957cSBarry Smith } 39549566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 395589c6957cSBarry Smith 395689c6957cSBarry Smith /* 395789c6957cSBarry Smith t = b - D x where D is the block diagonal 395889c6957cSBarry Smith */ 395989c6957cSBarry Smith cnt = 0; 396089c6957cSBarry Smith for (i = 0, row = 0; i < m; i++) { 3961*4d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 3962*4d12350bSJunchao Zhang switch (nodesz) { 396389c6957cSBarry Smith case 1: 39649371c9d4SSatish Balay t[row] = b[row] - bdiag[cnt++] * x[row]; 39659371c9d4SSatish Balay row++; 396689c6957cSBarry Smith break; 396789c6957cSBarry Smith case 2: 39689371c9d4SSatish Balay x1 = x[row]; 39699371c9d4SSatish Balay x2 = x[row + 1]; 397089c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 397189c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 397289c6957cSBarry Smith t[row] = b[row] - tmp1; 39739371c9d4SSatish Balay t[row + 1] = b[row + 1] - tmp2; 39749371c9d4SSatish Balay row += 2; 397589c6957cSBarry Smith cnt += 4; 397689c6957cSBarry Smith break; 397789c6957cSBarry Smith case 3: 39789371c9d4SSatish Balay x1 = x[row]; 39799371c9d4SSatish Balay x2 = x[row + 1]; 39809371c9d4SSatish Balay x3 = x[row + 2]; 398189c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 398289c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 398389c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 398489c6957cSBarry Smith t[row] = b[row] - tmp1; 398589c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2; 
39869371c9d4SSatish Balay t[row + 2] = b[row + 2] - tmp3; 39879371c9d4SSatish Balay row += 3; 398889c6957cSBarry Smith cnt += 9; 398989c6957cSBarry Smith break; 399089c6957cSBarry Smith case 4: 39919371c9d4SSatish Balay x1 = x[row]; 39929371c9d4SSatish Balay x2 = x[row + 1]; 39939371c9d4SSatish Balay x3 = x[row + 2]; 39949371c9d4SSatish Balay x4 = x[row + 3]; 399589c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 399689c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 399789c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 399889c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 399989c6957cSBarry Smith t[row] = b[row] - tmp1; 400089c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2; 400189c6957cSBarry Smith t[row + 2] = b[row + 2] - tmp3; 40029371c9d4SSatish Balay t[row + 3] = b[row + 3] - tmp4; 40039371c9d4SSatish Balay row += 4; 400489c6957cSBarry Smith cnt += 16; 400589c6957cSBarry Smith break; 400689c6957cSBarry Smith case 5: 40079371c9d4SSatish Balay x1 = x[row]; 40089371c9d4SSatish Balay x2 = x[row + 1]; 40099371c9d4SSatish Balay x3 = x[row + 2]; 40109371c9d4SSatish Balay x4 = x[row + 3]; 40119371c9d4SSatish Balay x5 = x[row + 4]; 401289c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 401389c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 401489c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 401589c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 401689c6957cSBarry 
Smith tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 401789c6957cSBarry Smith t[row] = b[row] - tmp1; 401889c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2; 401989c6957cSBarry Smith t[row + 2] = b[row + 2] - tmp3; 402089c6957cSBarry Smith t[row + 3] = b[row + 3] - tmp4; 40219371c9d4SSatish Balay t[row + 4] = b[row + 4] - tmp5; 40229371c9d4SSatish Balay row += 5; 402389c6957cSBarry Smith cnt += 25; 402489c6957cSBarry Smith break; 4025d71ae5a4SJacob Faibussowitsch default: 4026*4d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 402789c6957cSBarry Smith } 402889c6957cSBarry Smith } 40299566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(m)); 403089c6957cSBarry Smith 403189c6957cSBarry Smith /* 403289c6957cSBarry Smith Apply (L + D)^-1 where D is the block diagonal 403389c6957cSBarry Smith */ 403489c6957cSBarry Smith for (i = 0, row = 0; i < m; i++) { 4035*4d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 403689c6957cSBarry Smith sz = diag[row] - ii[row]; 403789c6957cSBarry Smith v1 = a->a + ii[row]; 403889c6957cSBarry Smith idx = a->j + ii[row]; 40394108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 4040*4d12350bSJunchao Zhang switch (nodesz) { 404189c6957cSBarry Smith case 1: 404289c6957cSBarry Smith 404389c6957cSBarry Smith sum1 = t[row]; 404489c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 404589c6957cSBarry Smith i1 = idx[0]; 404689c6957cSBarry Smith i2 = idx[1]; 404789c6957cSBarry Smith idx += 2; 404889c6957cSBarry Smith tmp0 = t[i1]; 404989c6957cSBarry Smith tmp1 = t[i2]; 40509371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 40519371c9d4SSatish Balay v1 += 2; 405289c6957cSBarry Smith } 405389c6957cSBarry Smith 405489c6957cSBarry Smith if (n == sz - 1) { 405589c6957cSBarry Smith tmp0 = t[*idx]; 405689c6957cSBarry Smith sum1 -= *v1 
* tmp0; 405789c6957cSBarry Smith } 40589371c9d4SSatish Balay x[row] += t[row] = sum1 * (*ibdiag++); 40599371c9d4SSatish Balay row++; 406089c6957cSBarry Smith break; 406189c6957cSBarry Smith case 2: 406289c6957cSBarry Smith v2 = a->a + ii[row + 1]; 406389c6957cSBarry Smith sum1 = t[row]; 406489c6957cSBarry Smith sum2 = t[row + 1]; 406589c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 406689c6957cSBarry Smith i1 = idx[0]; 406789c6957cSBarry Smith i2 = idx[1]; 406889c6957cSBarry Smith idx += 2; 406989c6957cSBarry Smith tmp0 = t[i1]; 407089c6957cSBarry Smith tmp1 = t[i2]; 40719371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 40729371c9d4SSatish Balay v1 += 2; 40739371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 40749371c9d4SSatish Balay v2 += 2; 407589c6957cSBarry Smith } 407689c6957cSBarry Smith 407789c6957cSBarry Smith if (n == sz - 1) { 407889c6957cSBarry Smith tmp0 = t[*idx]; 407989c6957cSBarry Smith sum1 -= v1[0] * tmp0; 408089c6957cSBarry Smith sum2 -= v2[0] * tmp0; 408189c6957cSBarry Smith } 408289c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 408389c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 40849371c9d4SSatish Balay ibdiag += 4; 40859371c9d4SSatish Balay row += 2; 408689c6957cSBarry Smith break; 408789c6957cSBarry Smith case 3: 408889c6957cSBarry Smith v2 = a->a + ii[row + 1]; 408989c6957cSBarry Smith v3 = a->a + ii[row + 2]; 409089c6957cSBarry Smith sum1 = t[row]; 409189c6957cSBarry Smith sum2 = t[row + 1]; 409289c6957cSBarry Smith sum3 = t[row + 2]; 409389c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 409489c6957cSBarry Smith i1 = idx[0]; 409589c6957cSBarry Smith i2 = idx[1]; 409689c6957cSBarry Smith idx += 2; 409789c6957cSBarry Smith tmp0 = t[i1]; 409889c6957cSBarry Smith tmp1 = t[i2]; 40999371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 41009371c9d4SSatish Balay v1 += 2; 41019371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 
41029371c9d4SSatish Balay v2 += 2; 41039371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 41049371c9d4SSatish Balay v3 += 2; 410589c6957cSBarry Smith } 410689c6957cSBarry Smith 410789c6957cSBarry Smith if (n == sz - 1) { 410889c6957cSBarry Smith tmp0 = t[*idx]; 410989c6957cSBarry Smith sum1 -= v1[0] * tmp0; 411089c6957cSBarry Smith sum2 -= v2[0] * tmp0; 411189c6957cSBarry Smith sum3 -= v3[0] * tmp0; 411289c6957cSBarry Smith } 411389c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 411489c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 411589c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 41169371c9d4SSatish Balay ibdiag += 9; 41179371c9d4SSatish Balay row += 3; 411889c6957cSBarry Smith break; 411989c6957cSBarry Smith case 4: 412089c6957cSBarry Smith v2 = a->a + ii[row + 1]; 412189c6957cSBarry Smith v3 = a->a + ii[row + 2]; 412289c6957cSBarry Smith v4 = a->a + ii[row + 3]; 412389c6957cSBarry Smith sum1 = t[row]; 412489c6957cSBarry Smith sum2 = t[row + 1]; 412589c6957cSBarry Smith sum3 = t[row + 2]; 412689c6957cSBarry Smith sum4 = t[row + 3]; 412789c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 412889c6957cSBarry Smith i1 = idx[0]; 412989c6957cSBarry Smith i2 = idx[1]; 413089c6957cSBarry Smith idx += 2; 413189c6957cSBarry Smith tmp0 = t[i1]; 413289c6957cSBarry Smith tmp1 = t[i2]; 41339371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 41349371c9d4SSatish Balay v1 += 2; 41359371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 41369371c9d4SSatish Balay v2 += 2; 41379371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 41389371c9d4SSatish Balay v3 += 2; 41399371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 41409371c9d4SSatish Balay v4 += 2; 414189c6957cSBarry Smith } 414289c6957cSBarry Smith 414389c6957cSBarry Smith if (n == sz - 1) { 414489c6957cSBarry Smith tmp0 = t[*idx]; 
414589c6957cSBarry Smith sum1 -= v1[0] * tmp0; 414689c6957cSBarry Smith sum2 -= v2[0] * tmp0; 414789c6957cSBarry Smith sum3 -= v3[0] * tmp0; 414889c6957cSBarry Smith sum4 -= v4[0] * tmp0; 414989c6957cSBarry Smith } 415089c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 415189c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 415289c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 415389c6957cSBarry Smith x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 41549371c9d4SSatish Balay ibdiag += 16; 41559371c9d4SSatish Balay row += 4; 415689c6957cSBarry Smith break; 415789c6957cSBarry Smith case 5: 415889c6957cSBarry Smith v2 = a->a + ii[row + 1]; 415989c6957cSBarry Smith v3 = a->a + ii[row + 2]; 416089c6957cSBarry Smith v4 = a->a + ii[row + 3]; 416189c6957cSBarry Smith v5 = a->a + ii[row + 4]; 416289c6957cSBarry Smith sum1 = t[row]; 416389c6957cSBarry Smith sum2 = t[row + 1]; 416489c6957cSBarry Smith sum3 = t[row + 2]; 416589c6957cSBarry Smith sum4 = t[row + 3]; 416689c6957cSBarry Smith sum5 = t[row + 4]; 416789c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 416889c6957cSBarry Smith i1 = idx[0]; 416989c6957cSBarry Smith i2 = idx[1]; 417089c6957cSBarry Smith idx += 2; 417189c6957cSBarry Smith tmp0 = t[i1]; 417289c6957cSBarry Smith tmp1 = t[i2]; 41739371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 41749371c9d4SSatish Balay v1 += 2; 41759371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 41769371c9d4SSatish Balay v2 += 2; 41779371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 41789371c9d4SSatish Balay v3 += 2; 41799371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 41809371c9d4SSatish Balay v4 += 2; 41819371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 
41829371c9d4SSatish Balay v5 += 2; 418389c6957cSBarry Smith } 418489c6957cSBarry Smith 418589c6957cSBarry Smith if (n == sz - 1) { 418689c6957cSBarry Smith tmp0 = t[*idx]; 418789c6957cSBarry Smith sum1 -= v1[0] * tmp0; 418889c6957cSBarry Smith sum2 -= v2[0] * tmp0; 418989c6957cSBarry Smith sum3 -= v3[0] * tmp0; 419089c6957cSBarry Smith sum4 -= v4[0] * tmp0; 419189c6957cSBarry Smith sum5 -= v5[0] * tmp0; 419289c6957cSBarry Smith } 419389c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 419489c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 419589c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 419689c6957cSBarry Smith x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 419789c6957cSBarry Smith x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 41989371c9d4SSatish Balay ibdiag += 25; 41999371c9d4SSatish Balay row += 5; 420089c6957cSBarry Smith break; 4201d71ae5a4SJacob Faibussowitsch default: 4202*4d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 420389c6957cSBarry Smith } 420489c6957cSBarry Smith } 42059566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 42065850ef23SBarry Smith } 42079566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 42089566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 42093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 42102af78befSBarry Smith } 42112af78befSBarry Smith 4212ff6a9541SJacob Faibussowitsch static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, 
Vec xx) 4213d71ae5a4SJacob Faibussowitsch { 421489c6957cSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 421589c6957cSBarry Smith PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 421689c6957cSBarry Smith const MatScalar *bdiag = a->inode.bdiag; 421789c6957cSBarry Smith const PetscScalar *b; 4218*4d12350bSJunchao Zhang PetscInt m = a->inode.node_count, cnt = 0, i, row, nodesz; 4219*4d12350bSJunchao Zhang const PetscInt *sizes = a->inode.size_csr; 42202af78befSBarry Smith 422189c6957cSBarry Smith PetscFunctionBegin; 4222*4d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 42239566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 42249566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 422589c6957cSBarry Smith cnt = 0; 422689c6957cSBarry Smith for (i = 0, row = 0; i < m; i++) { 4227*4d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i]; 4228*4d12350bSJunchao Zhang switch (nodesz) { 422989c6957cSBarry Smith case 1: 42309371c9d4SSatish Balay x[row] = b[row] * bdiag[cnt++]; 42319371c9d4SSatish Balay row++; 423289c6957cSBarry Smith break; 423389c6957cSBarry Smith case 2: 42349371c9d4SSatish Balay x1 = b[row]; 42359371c9d4SSatish Balay x2 = b[row + 1]; 423689c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 423789c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 423889c6957cSBarry Smith x[row++] = tmp1; 423989c6957cSBarry Smith x[row++] = tmp2; 424089c6957cSBarry Smith cnt += 4; 424189c6957cSBarry Smith break; 424289c6957cSBarry Smith case 3: 42439371c9d4SSatish Balay x1 = b[row]; 42449371c9d4SSatish Balay x2 = b[row + 1]; 42459371c9d4SSatish Balay x3 = b[row + 2]; 424689c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 424789c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 424889c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * 
bdiag[cnt + 8]; 424989c6957cSBarry Smith x[row++] = tmp1; 425089c6957cSBarry Smith x[row++] = tmp2; 425189c6957cSBarry Smith x[row++] = tmp3; 425289c6957cSBarry Smith cnt += 9; 425389c6957cSBarry Smith break; 425489c6957cSBarry Smith case 4: 42559371c9d4SSatish Balay x1 = b[row]; 42569371c9d4SSatish Balay x2 = b[row + 1]; 42579371c9d4SSatish Balay x3 = b[row + 2]; 42589371c9d4SSatish Balay x4 = b[row + 3]; 425989c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 426089c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 426189c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 426289c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 426389c6957cSBarry Smith x[row++] = tmp1; 426489c6957cSBarry Smith x[row++] = tmp2; 426589c6957cSBarry Smith x[row++] = tmp3; 426689c6957cSBarry Smith x[row++] = tmp4; 426789c6957cSBarry Smith cnt += 16; 426889c6957cSBarry Smith break; 426989c6957cSBarry Smith case 5: 42709371c9d4SSatish Balay x1 = b[row]; 42719371c9d4SSatish Balay x2 = b[row + 1]; 42729371c9d4SSatish Balay x3 = b[row + 2]; 42739371c9d4SSatish Balay x4 = b[row + 3]; 42749371c9d4SSatish Balay x5 = b[row + 4]; 427589c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 427689c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 427789c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 427889c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 427989c6957cSBarry Smith tmp5 = x1 * bdiag[cnt + 4] + x2 * 
bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 428089c6957cSBarry Smith x[row++] = tmp1; 428189c6957cSBarry Smith x[row++] = tmp2; 428289c6957cSBarry Smith x[row++] = tmp3; 428389c6957cSBarry Smith x[row++] = tmp4; 428489c6957cSBarry Smith x[row++] = tmp5; 428589c6957cSBarry Smith cnt += 25; 428689c6957cSBarry Smith break; 4287d71ae5a4SJacob Faibussowitsch default: 4288*4d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 428989c6957cSBarry Smith } 429089c6957cSBarry Smith } 42919566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * cnt)); 42929566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 42939566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 42943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 429589c6957cSBarry Smith } 429689c6957cSBarry Smith 4297d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4298d71ae5a4SJacob Faibussowitsch { 4299b215bc84SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4300b215bc84SStefano Zampini 4301b215bc84SStefano Zampini PetscFunctionBegin; 4302b215bc84SStefano Zampini a->inode.node_count = 0; 4303b215bc84SStefano Zampini a->inode.use = PETSC_FALSE; 4304b215bc84SStefano Zampini a->inode.checked = PETSC_FALSE; 4305b215bc84SStefano Zampini a->inode.mat_nonzerostate = -1; 4306b215bc84SStefano Zampini A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4307b215bc84SStefano Zampini A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4308b215bc84SStefano Zampini A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4309b215bc84SStefano Zampini A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4310b215bc84SStefano Zampini A->ops->coloringpatch = NULL; 4311b215bc84SStefano Zampini A->ops->multdiagonalblock = NULL; 4312ad540459SPierre Jolivet if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 43133ba16761SJacob 
Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4314b215bc84SStefano Zampini } 4315b215bc84SStefano Zampini 43164c1414c8SBarry Smith /* 43174c1414c8SBarry Smith samestructure indicates that the matrix has not changed its nonzero structure so we 43184c1414c8SBarry Smith do not need to recompute the inodes 43194c1414c8SBarry Smith */ 4320d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode(Mat A) 4321d71ae5a4SJacob Faibussowitsch { 43224c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 43238758e1faSBarry Smith PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4324ace3abfcSBarry Smith PetscBool flag; 43258758e1faSBarry Smith const PetscInt *idx, *idy, *ii; 43264c1414c8SBarry Smith 43274c1414c8SBarry Smith PetscFunctionBegin; 4328b215bc84SStefano Zampini if (!a->inode.use) { 43299566063dSJacob Faibussowitsch PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4330*4d12350bSJunchao Zhang PetscCall(PetscFree(a->inode.size_csr)); 43313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4332b215bc84SStefano Zampini } 43333ba16761SJacob Faibussowitsch if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 43344c1414c8SBarry Smith 4335d0f46423SBarry Smith m = A->rmap->n; 4336*4d12350bSJunchao Zhang if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr)); 4337*4d12350bSJunchao Zhang ns = a->inode.size_csr; 4338*4d12350bSJunchao Zhang ns[0] = 0; 43394c1414c8SBarry Smith 43404c1414c8SBarry Smith i = 0; 43414c1414c8SBarry Smith node_count = 0; 43424c1414c8SBarry Smith idx = a->j; 43434c1414c8SBarry Smith ii = a->i; 43446f2c871aSStefano Zampini if (idx) { 43454c1414c8SBarry Smith while (i < m) { /* For each row */ 43464c1414c8SBarry Smith nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 43474c1414c8SBarry Smith /* Limits the number of elements in a node to 'a->inode.limit' */ 43484c1414c8SBarry Smith for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, 
++blk_size) { 43494c1414c8SBarry Smith nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 43504c1414c8SBarry Smith if (nzy != nzx) break; 43514c1414c8SBarry Smith idy += nzx; /* Same nonzero pattern */ 43529566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 43534c1414c8SBarry Smith if (!flag) break; 43544c1414c8SBarry Smith } 4355*4d12350bSJunchao Zhang ns[node_count + 1] = ns[node_count] + blk_size; 4356*4d12350bSJunchao Zhang node_count++; 43574c1414c8SBarry Smith idx += blk_size * nzx; 43584c1414c8SBarry Smith i = j; 43594c1414c8SBarry Smith } 43606f2c871aSStefano Zampini } 43614c1414c8SBarry Smith /* If not enough inodes found,, do not use inode version of the routines */ 43626f2c871aSStefano Zampini if (!m || !idx || node_count > .8 * m) { 43639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4364*4d12350bSJunchao Zhang PetscCall(PetscFree(a->inode.size_csr)); 43659566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. 
Not using Inode routines\n", node_count, m)); 43664c1414c8SBarry Smith } else { 4367d5f3da31SBarry Smith if (!A->factortype) { 4368375a6242SBarry Smith A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4369375a6242SBarry Smith if (A->rmap->n == A->cmap->n) { 43704108e4d5SBarry Smith A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 43714108e4d5SBarry Smith A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 43724108e4d5SBarry Smith A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 43734108e4d5SBarry Smith A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 43744108e4d5SBarry Smith A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4375375a6242SBarry Smith } 4376d3ac4fa3SBarry Smith } else { 4377d3ac4fa3SBarry Smith A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4378d3ac4fa3SBarry Smith } 43794c1414c8SBarry Smith a->inode.node_count = node_count; 43809566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". 
Using Inode routines\n", node_count, m, a->inode.limit)); 43814c1414c8SBarry Smith } 4382be6adb11SBarry Smith a->inode.checked = PETSC_TRUE; 4383a02bda8eSBarry Smith a->inode.mat_nonzerostate = A->nonzerostate; 43843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 43854c1414c8SBarry Smith } 43864c1414c8SBarry Smith 4387d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 4388d71ae5a4SJacob Faibussowitsch { 4389150f0143SBarry Smith Mat B = *C; 4390150f0143SBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4391150f0143SBarry Smith PetscInt m = A->rmap->n; 4392150f0143SBarry Smith 4393150f0143SBarry Smith PetscFunctionBegin; 4394150f0143SBarry Smith c->inode.use = a->inode.use; 4395150f0143SBarry Smith c->inode.limit = a->inode.limit; 4396150f0143SBarry Smith c->inode.max_limit = a->inode.max_limit; 4397ec710b6aSStefano Zampini c->inode.checked = PETSC_FALSE; 4398*4d12350bSJunchao Zhang c->inode.size_csr = NULL; 4399ec710b6aSStefano Zampini c->inode.node_count = 0; 4400ec710b6aSStefano Zampini c->inode.ibdiagvalid = PETSC_FALSE; 4401ec710b6aSStefano Zampini c->inode.ibdiag = NULL; 4402ec710b6aSStefano Zampini c->inode.bdiag = NULL; 4403ec710b6aSStefano Zampini c->inode.mat_nonzerostate = -1; 4404b215bc84SStefano Zampini if (a->inode.use) { 4405*4d12350bSJunchao Zhang if (a->inode.checked && a->inode.size_csr) { 4406*4d12350bSJunchao Zhang PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr)); 4407*4d12350bSJunchao Zhang PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1)); 4408ec710b6aSStefano Zampini 4409ec710b6aSStefano Zampini c->inode.checked = PETSC_TRUE; 4410ec710b6aSStefano Zampini c->inode.node_count = a->inode.node_count; 4411ec710b6aSStefano Zampini c->inode.mat_nonzerostate = (*C)->nonzerostate; 4412ec710b6aSStefano Zampini } 4413a02bda8eSBarry Smith /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 
44142c451681SBarry Smith if (!B->factortype) { 44152c451681SBarry Smith B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 44162c451681SBarry Smith B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 44172c451681SBarry Smith B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 44182c451681SBarry Smith B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 44192c451681SBarry Smith B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 44202c451681SBarry Smith B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4421150f0143SBarry Smith } else { 44222c451681SBarry Smith B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4423150f0143SBarry Smith } 4424150f0143SBarry Smith } 44253ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4426150f0143SBarry Smith } 4427150f0143SBarry Smith 4428d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4429d71ae5a4SJacob Faibussowitsch { 44308758e1faSBarry Smith PetscInt k; 44318758e1faSBarry Smith const PetscInt *vi; 44326e111a19SKarl Rupp 443317454e89SShri Abhyankar PetscFunctionBegin; 443417454e89SShri Abhyankar vi = aj + ai[row]; 443517454e89SShri Abhyankar for (k = 0; k < nzl; k++) cols[k] = vi[k]; 443617454e89SShri Abhyankar vi = aj + adiag[row]; 443717454e89SShri Abhyankar cols[nzl] = vi[0]; 443817454e89SShri Abhyankar vi = aj + adiag[row + 1] + 1; 443917454e89SShri Abhyankar for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 44403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 444117454e89SShri Abhyankar } 44426936b636SHong Zhang /* 4443a02bda8eSBarry Smith MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4444a02bda8eSBarry Smith Modified from MatSeqAIJCheckInode(). 44456936b636SHong Zhang 44466936b636SHong Zhang Input Parameters: 4447abb87a52SBarry Smith . 
Mat A - ILU or LU matrix factor 4448abb87a52SBarry Smith 44496936b636SHong Zhang */ 4450d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4451d71ae5a4SJacob Faibussowitsch { 4452019b515eSShri Abhyankar Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4453019b515eSShri Abhyankar PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 44548758e1faSBarry Smith PetscInt *cols1, *cols2, *ns; 44558758e1faSBarry Smith const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4456ace3abfcSBarry Smith PetscBool flag; 4457019b515eSShri Abhyankar 4458019b515eSShri Abhyankar PetscFunctionBegin; 44593ba16761SJacob Faibussowitsch if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 44603ba16761SJacob Faibussowitsch if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4461019b515eSShri Abhyankar 4462019b515eSShri Abhyankar m = A->rmap->n; 4463*4d12350bSJunchao Zhang if (a->inode.size_csr) ns = a->inode.size_csr; 446448a46eb9SPierre Jolivet else PetscCall(PetscMalloc1(m + 1, &ns)); 4465*4d12350bSJunchao Zhang ns[0] = 0; 4466019b515eSShri Abhyankar 4467019b515eSShri Abhyankar i = 0; 4468019b515eSShri Abhyankar node_count = 0; 44699566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4470019b515eSShri Abhyankar while (i < m) { /* For each row */ 4471019b515eSShri Abhyankar nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4472019b515eSShri Abhyankar nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4473019b515eSShri Abhyankar nzx = nzl1 + nzu1 + 1; 44743ba16761SJacob Faibussowitsch PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4475019b515eSShri Abhyankar 4476019b515eSShri Abhyankar /* Limits the number of elements in a node to 'a->inode.limit' */ 4477019b515eSShri Abhyankar for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4478019b515eSShri Abhyankar nzl2 = ai[j + 1] - ai[j]; 4479019b515eSShri Abhyankar nzu2 = 
adiag[j] - adiag[j + 1] - 1; 4480019b515eSShri Abhyankar nzy = nzl2 + nzu2 + 1; 4481019b515eSShri Abhyankar if (nzy != nzx) break; 44829566063dSJacob Faibussowitsch PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 44839566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 44848758e1faSBarry Smith if (!flag) break; 4485019b515eSShri Abhyankar } 4486*4d12350bSJunchao Zhang ns[node_count + 1] = ns[node_count] + blk_size; 4487*4d12350bSJunchao Zhang node_count++; 4488019b515eSShri Abhyankar i = j; 4489019b515eSShri Abhyankar } 44909566063dSJacob Faibussowitsch PetscCall(PetscFree2(cols1, cols2)); 4491019b515eSShri Abhyankar /* If not enough inodes found,, do not use inode version of the routines */ 4492be6adb11SBarry Smith if (!m || node_count > .8 * m) { 44939566063dSJacob Faibussowitsch PetscCall(PetscFree(ns)); 44942205254eSKarl Rupp 4495019b515eSShri Abhyankar a->inode.node_count = 0; 4496*4d12350bSJunchao Zhang a->inode.size_csr = NULL; 4497019b515eSShri Abhyankar a->inode.use = PETSC_FALSE; 44982205254eSKarl Rupp 44999566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. 
Not using Inode routines\n", node_count, m)); 4500019b515eSShri Abhyankar } else { 4501f4259b30SLisandro Dalcin A->ops->mult = NULL; 4502f4259b30SLisandro Dalcin A->ops->sor = NULL; 4503f4259b30SLisandro Dalcin A->ops->multadd = NULL; 4504f4259b30SLisandro Dalcin A->ops->getrowij = NULL; 4505f4259b30SLisandro Dalcin A->ops->restorerowij = NULL; 4506f4259b30SLisandro Dalcin A->ops->getcolumnij = NULL; 4507f4259b30SLisandro Dalcin A->ops->restorecolumnij = NULL; 4508f4259b30SLisandro Dalcin A->ops->coloringpatch = NULL; 4509f4259b30SLisandro Dalcin A->ops->multdiagonalblock = NULL; 4510019b515eSShri Abhyankar a->inode.node_count = node_count; 4511*4d12350bSJunchao Zhang a->inode.size_csr = ns; 45129566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4513019b515eSShri Abhyankar } 4514be6adb11SBarry Smith a->inode.checked = PETSC_TRUE; 45153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4516019b515eSShri Abhyankar } 4517019b515eSShri Abhyankar 4518d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4519d71ae5a4SJacob Faibussowitsch { 4520acf2f550SJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4521acf2f550SJed Brown 4522acf2f550SJed Brown PetscFunctionBegin; 4523acf2f550SJed Brown a->inode.ibdiagvalid = PETSC_FALSE; 45243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4525acf2f550SJed Brown } 4526acf2f550SJed Brown 45274c1414c8SBarry Smith /* 45284c1414c8SBarry Smith This is really ugly. 
if inodes are used this replaces the 45294c1414c8SBarry Smith permutations with ones that correspond to rows/cols of the matrix 4530467446fbSPierre Jolivet rather than inode blocks 45314c1414c8SBarry Smith */ 4532d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4533d71ae5a4SJacob Faibussowitsch { 45344c1414c8SBarry Smith PetscFunctionBegin; 4535cac4c232SBarry Smith PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 45363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 45374c1414c8SBarry Smith } 45384c1414c8SBarry Smith 4539d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4540d71ae5a4SJacob Faibussowitsch { 45414c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 45425d0c19d7SBarry Smith PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 45435d0c19d7SBarry Smith const PetscInt *ridx, *cidx; 4544*4d12350bSJunchao Zhang PetscInt row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx; 45454c1414c8SBarry Smith PetscInt nslim_col, *ns_col; 45464c1414c8SBarry Smith IS ris = *rperm, cis = *cperm; 45474c1414c8SBarry Smith 45484c1414c8SBarry Smith PetscFunctionBegin; 4549*4d12350bSJunchao Zhang if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 45503ba16761SJacob Faibussowitsch if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 45514c1414c8SBarry Smith 45529566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 455332603206SJames Wright PetscCall(PetscMalloc1(((nslim_row > nslim_col ? 
nslim_row : nslim_col) + 1), &tns)); 45549566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(m, &permr, n, &permc)); 45554c1414c8SBarry Smith 45569566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ris, &ridx)); 45579566063dSJacob Faibussowitsch PetscCall(ISGetIndices(cis, &cidx)); 45584c1414c8SBarry Smith 4559baca6076SPierre Jolivet /* Form the inode structure for the rows of permuted matrix using inv perm*/ 4560*4d12350bSJunchao Zhang for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]); 45614c1414c8SBarry Smith 45624c1414c8SBarry Smith /* Construct the permutations for rows*/ 45634c1414c8SBarry Smith for (i = 0, row = 0; i < nslim_row; ++i) { 45644c1414c8SBarry Smith indx = ridx[i]; 45654c1414c8SBarry Smith start_val = tns[indx]; 45664c1414c8SBarry Smith end_val = tns[indx + 1]; 45674c1414c8SBarry Smith for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 45684c1414c8SBarry Smith } 45694c1414c8SBarry Smith 45704c1414c8SBarry Smith /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4571*4d12350bSJunchao Zhang for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]); 45724c1414c8SBarry Smith 45734c1414c8SBarry Smith /* Construct permutations for columns */ 45744c1414c8SBarry Smith for (i = 0, col = 0; i < nslim_col; ++i) { 45754c1414c8SBarry Smith indx = cidx[i]; 45764c1414c8SBarry Smith start_val = tns[indx]; 45774c1414c8SBarry Smith end_val = tns[indx + 1]; 45784c1414c8SBarry Smith for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 45794c1414c8SBarry Smith } 45804c1414c8SBarry Smith 45819566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 45829566063dSJacob Faibussowitsch PetscCall(ISSetPermutation(*rperm)); 45839566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 45849566063dSJacob Faibussowitsch 
PetscCall(ISSetPermutation(*cperm)); 45854c1414c8SBarry Smith 45869566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ris, &ridx)); 45879566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(cis, &cidx)); 45884c1414c8SBarry Smith 45899566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col)); 45909566063dSJacob Faibussowitsch PetscCall(PetscFree2(permr, permc)); 45919566063dSJacob Faibussowitsch PetscCall(ISDestroy(&cis)); 45929566063dSJacob Faibussowitsch PetscCall(ISDestroy(&ris)); 45939566063dSJacob Faibussowitsch PetscCall(PetscFree(tns)); 45943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 45954c1414c8SBarry Smith } 45964c1414c8SBarry Smith 45974c1414c8SBarry Smith /*@C 459811a5261eSBarry Smith MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 45994c1414c8SBarry Smith 46003f9fe445SBarry Smith Not Collective 46014c1414c8SBarry Smith 46024c1414c8SBarry Smith Input Parameter: 460311a5261eSBarry Smith . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 46044c1414c8SBarry Smith 4605d8d19677SJose E. Roman Output Parameters: 46064c1414c8SBarry Smith + node_count - no of inodes present in the matrix. 46072ef1f0ffSBarry Smith . sizes - an array of size `node_count`, with the sizes of each inode. 46084c1414c8SBarry Smith - limit - the max size used to generate the inodes. 46094c1414c8SBarry Smith 46104c1414c8SBarry Smith Level: advanced 46114c1414c8SBarry Smith 461211a5261eSBarry Smith Note: 46134c1414c8SBarry Smith It should be called after the matrix is assembled. 46144c1414c8SBarry Smith The contents of the sizes[] array should not be changed. 
46152ef1f0ffSBarry Smith `NULL` may be passed for information not needed 46164c1414c8SBarry Smith 46171cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatGetInfo()` 46184c1414c8SBarry Smith @*/ 4619d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4620d71ae5a4SJacob Faibussowitsch { 46215f80ce2aSJacob Faibussowitsch PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 46224c1414c8SBarry Smith 46234c1414c8SBarry Smith PetscFunctionBegin; 46245f80ce2aSJacob Faibussowitsch PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 46259566063dSJacob Faibussowitsch PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 46269566063dSJacob Faibussowitsch if (f) PetscCall((*f)(A, node_count, sizes, limit)); 46273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 46284c1414c8SBarry Smith } 46294c1414c8SBarry Smith 4630d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4631d71ae5a4SJacob Faibussowitsch { 46324c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 46334c1414c8SBarry Smith 46344c1414c8SBarry Smith PetscFunctionBegin; 46354c1414c8SBarry Smith if (node_count) *node_count = a->inode.node_count; 4636*4d12350bSJunchao Zhang if (sizes) *sizes = a->inode.size_csr; 46374c1414c8SBarry Smith if (limit) *limit = a->inode.limit; 46383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 46394c1414c8SBarry Smith } 4640