14c1414c8SBarry Smith 24c1414c8SBarry Smith /* 34c1414c8SBarry Smith This file provides high performance routines for the Inode format (compressed sparse row) 44c1414c8SBarry Smith by taking advantage of rows with identical nonzero structure (I-nodes). 54c1414c8SBarry Smith */ 6c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> 74c1414c8SBarry Smith 8d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 9d71ae5a4SJacob Faibussowitsch { 104c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 114c1414c8SBarry Smith PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 124c1414c8SBarry Smith 134c1414c8SBarry Smith PetscFunctionBegin; 14d0f46423SBarry Smith n = A->cmap->n; 15d0f46423SBarry Smith m = A->rmap->n; 1608401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 174c1414c8SBarry Smith ns_row = a->inode.size; 184c1414c8SBarry Smith 194c1414c8SBarry Smith min_mn = (m < n) ? m : n; 204c1414c8SBarry Smith if (!ns) { 219371c9d4SSatish Balay for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) 229371c9d4SSatish Balay ; 239371c9d4SSatish Balay for (; count + 1 < n; count++, i++) 249371c9d4SSatish Balay ; 25ad540459SPierre Jolivet if (count < n) i++; 264c1414c8SBarry Smith *size = i; 273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 284c1414c8SBarry Smith } 299566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &ns_col)); 304c1414c8SBarry Smith 314c1414c8SBarry Smith /* Use the same row structure wherever feasible. */ 32ad540459SPierre Jolivet for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i]; 334c1414c8SBarry Smith 344c1414c8SBarry Smith /* if m < n; pad up the remainder with inode_limit */ 35ad540459SPierre Jolivet for (; count + 1 < n; count++, i++) ns_col[i] = 1; 364c1414c8SBarry Smith /* The last node is the odd ball. 
padd it up with the remaining rows; */ 374c1414c8SBarry Smith if (count < n) { 384c1414c8SBarry Smith ns_col[i] = n - count; 394c1414c8SBarry Smith i++; 404c1414c8SBarry Smith } else if (count > n) { 414c1414c8SBarry Smith /* Adjust for the over estimation */ 424c1414c8SBarry Smith ns_col[i - 1] += n - count; 434c1414c8SBarry Smith } 444c1414c8SBarry Smith *size = i; 454c1414c8SBarry Smith *ns = ns_col; 463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 474c1414c8SBarry Smith } 484c1414c8SBarry Smith 494c1414c8SBarry Smith /* 504c1414c8SBarry Smith This builds symmetric version of nonzero structure, 514c1414c8SBarry Smith */ 52d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 53d71ae5a4SJacob Faibussowitsch { 544c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 558758e1faSBarry Smith PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 568758e1faSBarry Smith PetscInt *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2; 578758e1faSBarry Smith const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 584c1414c8SBarry Smith 594c1414c8SBarry Smith PetscFunctionBegin; 604c1414c8SBarry Smith nslim_row = a->inode.node_count; 61d0f46423SBarry Smith m = A->rmap->n; 62d0f46423SBarry Smith n = A->cmap->n; 6308401ef6SPierre Jolivet PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 6408401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 654c1414c8SBarry Smith 664c1414c8SBarry Smith /* Use the row_inode as column_inode */ 674c1414c8SBarry Smith nslim_col = nslim_row; 684c1414c8SBarry Smith ns_col = ns_row; 694c1414c8SBarry Smith 7035cb6cd3SPierre Jolivet /* allocate space for reformatted inode structure */ 719566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 
724c1414c8SBarry Smith for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1]; 734c1414c8SBarry Smith 744c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 754c1414c8SBarry Smith nsz = ns_col[i1]; 762205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 774c1414c8SBarry Smith } 784c1414c8SBarry Smith /* allocate space for row pointers */ 799566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 804c1414c8SBarry Smith *iia = ia; 819566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_row + 1, &work)); 824c1414c8SBarry Smith 834c1414c8SBarry Smith /* determine the number of columns in each row */ 844c1414c8SBarry Smith ia[0] = oshift; 854c1414c8SBarry Smith for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 864c1414c8SBarry Smith j = aj + ai[row] + ishift; 874c1414c8SBarry Smith jmax = aj + ai[row + 1] + ishift; 8883fed2edSSatish Balay if (j == jmax) continue; /* empty row */ 894c1414c8SBarry Smith col = *j++ + ishift; 904c1414c8SBarry Smith i2 = tvc[col]; 916aad120cSJose E. 
Roman while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 924c1414c8SBarry Smith ia[i1 + 1]++; 934c1414c8SBarry Smith ia[i2 + 1]++; 944c1414c8SBarry Smith i2++; /* Start col of next node */ 9590d2dec7SBarry Smith while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 964c1414c8SBarry Smith i2 = tvc[col]; 974c1414c8SBarry Smith } 984c1414c8SBarry Smith if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 994c1414c8SBarry Smith } 1004c1414c8SBarry Smith 1014c1414c8SBarry Smith /* shift ia[i] to point to next row */ 1024c1414c8SBarry Smith for (i1 = 1; i1 < nslim_row + 1; i1++) { 1034c1414c8SBarry Smith row = ia[i1 - 1]; 1044c1414c8SBarry Smith ia[i1] += row; 1054c1414c8SBarry Smith work[i1 - 1] = row - oshift; 1064c1414c8SBarry Smith } 1074c1414c8SBarry Smith 1084c1414c8SBarry Smith /* allocate space for column pointers */ 1094c1414c8SBarry Smith nz = ia[nslim_row] + (!ishift); 1109566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja)); 1114c1414c8SBarry Smith *jja = ja; 1124c1414c8SBarry Smith 1134c1414c8SBarry Smith /* loop over lower triangular part putting into ja */ 1144c1414c8SBarry Smith for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 1154c1414c8SBarry Smith j = aj + ai[row] + ishift; 1164c1414c8SBarry Smith jmax = aj + ai[row + 1] + ishift; 11783fed2edSSatish Balay if (j == jmax) continue; /* empty row */ 1184c1414c8SBarry Smith col = *j++ + ishift; 1194c1414c8SBarry Smith i2 = tvc[col]; 1204c1414c8SBarry Smith while (i2 < i1 && j < jmax) { 1214c1414c8SBarry Smith ja[work[i2]++] = i1 + oshift; 1224c1414c8SBarry Smith ja[work[i1]++] = i2 + oshift; 1234c1414c8SBarry Smith ++i2; 12490d2dec7SBarry Smith while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 1254c1414c8SBarry Smith i2 = tvc[col]; 1264c1414c8SBarry Smith } 1274c1414c8SBarry Smith if (i2 == i1) ja[work[i1]++] = i2 + oshift; 1284c1414c8SBarry Smith } 1299566063dSJacob Faibussowitsch 
PetscCall(PetscFree(work)); 1309566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc)); 1313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1324c1414c8SBarry Smith } 1334c1414c8SBarry Smith 1344c1414c8SBarry Smith /* 1354c1414c8SBarry Smith This builds nonsymmetric version of nonzero structure, 1364c1414c8SBarry Smith */ 137d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 138d71ae5a4SJacob Faibussowitsch { 1394c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1408758e1faSBarry Smith PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 1418758e1faSBarry Smith PetscInt *tns, *tvc, nsz, i1, i2; 1428758e1faSBarry Smith const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size; 1434c1414c8SBarry Smith 1444c1414c8SBarry Smith PetscFunctionBegin; 14508401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 1464c1414c8SBarry Smith nslim_row = a->inode.node_count; 147d0f46423SBarry Smith n = A->cmap->n; 1484c1414c8SBarry Smith 1494c1414c8SBarry Smith /* Create The column_inode for this matrix */ 1509566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 1514c1414c8SBarry Smith 15235cb6cd3SPierre Jolivet /* allocate space for reformatted column_inode structure */ 1539566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 1544c1414c8SBarry Smith for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 1554c1414c8SBarry Smith 1564c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 1574c1414c8SBarry Smith nsz = ns_col[i1]; 1582205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 1594c1414c8SBarry Smith } 1604c1414c8SBarry Smith /* allocate space for row pointers */ 1619566063dSJacob Faibussowitsch 
PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 1624c1414c8SBarry Smith *iia = ia; 1639566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_row + 1, &work)); 1644c1414c8SBarry Smith 1654c1414c8SBarry Smith /* determine the number of columns in each row */ 1664c1414c8SBarry Smith ia[0] = oshift; 1674c1414c8SBarry Smith for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 1684c1414c8SBarry Smith j = aj + ai[row] + ishift; 16983fed2edSSatish Balay nz = ai[row + 1] - ai[row]; 17083fed2edSSatish Balay if (!nz) continue; /* empty row */ 1714c1414c8SBarry Smith col = *j++ + ishift; 1724c1414c8SBarry Smith i2 = tvc[col]; 1736aad120cSJose E. Roman while (nz-- > 0) { /* off-diagonal elements */ 1744c1414c8SBarry Smith ia[i1 + 1]++; 1754c1414c8SBarry Smith i2++; /* Start col of next node */ 176a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 1774c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 1784c1414c8SBarry Smith } 1794c1414c8SBarry Smith } 1804c1414c8SBarry Smith 1814c1414c8SBarry Smith /* shift ia[i] to point to next row */ 1824c1414c8SBarry Smith for (i1 = 1; i1 < nslim_row + 1; i1++) { 1834c1414c8SBarry Smith row = ia[i1 - 1]; 1844c1414c8SBarry Smith ia[i1] += row; 1854c1414c8SBarry Smith work[i1 - 1] = row - oshift; 1864c1414c8SBarry Smith } 1874c1414c8SBarry Smith 1884c1414c8SBarry Smith /* allocate space for column pointers */ 1894c1414c8SBarry Smith nz = ia[nslim_row] + (!ishift); 1909566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja)); 1914c1414c8SBarry Smith *jja = ja; 1924c1414c8SBarry Smith 1934c1414c8SBarry Smith /* loop over matrix putting into ja */ 1944c1414c8SBarry Smith for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 1954c1414c8SBarry Smith j = aj + ai[row] + ishift; 19683fed2edSSatish Balay nz = ai[row + 1] - ai[row]; 19783fed2edSSatish Balay if (!nz) continue; /* empty row */ 1984c1414c8SBarry Smith col = *j++ + ishift; 1994c1414c8SBarry Smith i2 = tvc[col]; 2004c1414c8SBarry Smith 
while (nz-- > 0) { 2014c1414c8SBarry Smith ja[work[i1]++] = i2 + oshift; 2024c1414c8SBarry Smith ++i2; 203a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 2044c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 2054c1414c8SBarry Smith } 2064c1414c8SBarry Smith } 2079566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col)); 2089566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 2099566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc)); 2103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2114c1414c8SBarry Smith } 2124c1414c8SBarry Smith 213d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 214d71ae5a4SJacob Faibussowitsch { 2154c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2164c1414c8SBarry Smith 2174c1414c8SBarry Smith PetscFunctionBegin; 21850ba90b4SBarry Smith if (n) *n = a->inode.node_count; 2193ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 2208f7157efSSatish Balay if (!blockcompressed) { 2219566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 2228f7157efSSatish Balay } else if (symmetric) { 2239566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 2244c1414c8SBarry Smith } else { 2259566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 2264c1414c8SBarry Smith } 2273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2284c1414c8SBarry Smith } 2294c1414c8SBarry Smith 230d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 231d71ae5a4SJacob Faibussowitsch { 
2324c1414c8SBarry Smith PetscFunctionBegin; 2333ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 2348f7157efSSatish Balay 2358f7157efSSatish Balay if (!blockcompressed) { 2369566063dSJacob Faibussowitsch PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 2378f7157efSSatish Balay } else { 2389566063dSJacob Faibussowitsch PetscCall(PetscFree(*ia)); 2399566063dSJacob Faibussowitsch PetscCall(PetscFree(*ja)); 2408f7157efSSatish Balay } 2413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2424c1414c8SBarry Smith } 2434c1414c8SBarry Smith 2444c1414c8SBarry Smith /* ----------------------------------------------------------- */ 2454c1414c8SBarry Smith 246d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 247d71ae5a4SJacob Faibussowitsch { 2484c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2494c1414c8SBarry Smith PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 2504c1414c8SBarry Smith PetscInt *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j; 2514c1414c8SBarry Smith 2524c1414c8SBarry Smith PetscFunctionBegin; 25308401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2544c1414c8SBarry Smith nslim_row = a->inode.node_count; 255d0f46423SBarry Smith n = A->cmap->n; 2564c1414c8SBarry Smith 2574c1414c8SBarry Smith /* Create The column_inode for this matrix */ 2589566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 2594c1414c8SBarry Smith 26035cb6cd3SPierre Jolivet /* allocate space for reformatted column_inode structure */ 2619566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 2624c1414c8SBarry Smith for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + 
ns_col[i1]; 2634c1414c8SBarry Smith 2644c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 2654c1414c8SBarry Smith nsz = ns_col[i1]; 2662205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 2674c1414c8SBarry Smith } 2684c1414c8SBarry Smith /* allocate space for column pointers */ 2699566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 2704c1414c8SBarry Smith *iia = ia; 2719566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_col + 1, &work)); 2724c1414c8SBarry Smith 2734c1414c8SBarry Smith /* determine the number of columns in each row */ 2744c1414c8SBarry Smith ia[0] = oshift; 2754c1414c8SBarry Smith for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 2764c1414c8SBarry Smith j = aj + ai[row] + ishift; 2774c1414c8SBarry Smith col = *j++ + ishift; 2784c1414c8SBarry Smith i2 = tvc[col]; 2794c1414c8SBarry Smith nz = ai[row + 1] - ai[row]; 2806aad120cSJose E. Roman while (nz-- > 0) { /* off-diagonal elements */ 2814c1414c8SBarry Smith /* ia[i1+1]++; */ 2824c1414c8SBarry Smith ia[i2 + 1]++; 2834c1414c8SBarry Smith i2++; 284a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 2854c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 2864c1414c8SBarry Smith } 2874c1414c8SBarry Smith } 2884c1414c8SBarry Smith 2894c1414c8SBarry Smith /* shift ia[i] to point to next col */ 2904c1414c8SBarry Smith for (i1 = 1; i1 < nslim_col + 1; i1++) { 2914c1414c8SBarry Smith col = ia[i1 - 1]; 2924c1414c8SBarry Smith ia[i1] += col; 2934c1414c8SBarry Smith work[i1 - 1] = col - oshift; 2944c1414c8SBarry Smith } 2954c1414c8SBarry Smith 2964c1414c8SBarry Smith /* allocate space for column pointers */ 2974c1414c8SBarry Smith nz = ia[nslim_col] + (!ishift); 2989566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja)); 2994c1414c8SBarry Smith *jja = ja; 3004c1414c8SBarry Smith 3014c1414c8SBarry Smith /* loop over matrix putting into ja */ 3024c1414c8SBarry Smith for (i1 = 0, row = 0; i1 < nslim_row; row += 
ns_row[i1], i1++) { 3034c1414c8SBarry Smith j = aj + ai[row] + ishift; 3044c1414c8SBarry Smith col = *j++ + ishift; 3054c1414c8SBarry Smith i2 = tvc[col]; 3064c1414c8SBarry Smith nz = ai[row + 1] - ai[row]; 3074c1414c8SBarry Smith while (nz-- > 0) { 3084c1414c8SBarry Smith /* ja[work[i1]++] = i2 + oshift; */ 3094c1414c8SBarry Smith ja[work[i2]++] = i1 + oshift; 3104c1414c8SBarry Smith i2++; 311a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 3124c1414c8SBarry Smith if (nz > 0) i2 = tvc[col]; 3134c1414c8SBarry Smith } 3144c1414c8SBarry Smith } 3159566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col)); 3169566063dSJacob Faibussowitsch PetscCall(PetscFree(work)); 3179566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc)); 3183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3194c1414c8SBarry Smith } 3204c1414c8SBarry Smith 321d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 322d71ae5a4SJacob Faibussowitsch { 3234c1414c8SBarry Smith PetscFunctionBegin; 3249566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, n, NULL)); 3253ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 3264c1414c8SBarry Smith 3278f7157efSSatish Balay if (!blockcompressed) { 3289566063dSJacob Faibussowitsch PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 3298f7157efSSatish Balay } else if (symmetric) { 330a5b23f4aSJose E. 
Roman /* Since the indices are symmetric it doesn't matter */ 3319566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 3324c1414c8SBarry Smith } else { 3339566063dSJacob Faibussowitsch PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 3344c1414c8SBarry Smith } 3353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3364c1414c8SBarry Smith } 3374c1414c8SBarry Smith 338d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 339d71ae5a4SJacob Faibussowitsch { 3404c1414c8SBarry Smith PetscFunctionBegin; 3413ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 3428f7157efSSatish Balay if (!blockcompressed) { 3439566063dSJacob Faibussowitsch PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 3448f7157efSSatish Balay } else { 3459566063dSJacob Faibussowitsch PetscCall(PetscFree(*ia)); 3469566063dSJacob Faibussowitsch PetscCall(PetscFree(*ja)); 3478f7157efSSatish Balay } 3483ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3494c1414c8SBarry Smith } 3504c1414c8SBarry Smith 3514c1414c8SBarry Smith /* ----------------------------------------------------------- */ 3524c1414c8SBarry Smith 353d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 354d71ae5a4SJacob Faibussowitsch { 3554c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3564c1414c8SBarry Smith PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 357d9fead3dSBarry Smith PetscScalar *y; 358dd6ea824SBarry Smith const PetscScalar *x; 359dd6ea824SBarry Smith const MatScalar *v1, *v2, *v3, *v4, *v5; 3608758e1faSBarry Smith PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0; 3618758e1faSBarry Smith const PetscInt *idx, *ns, *ii; 
3624c1414c8SBarry Smith 3634c1414c8SBarry Smith #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 3644c1414c8SBarry Smith #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 3654c1414c8SBarry Smith #endif 3664c1414c8SBarry Smith 3674c1414c8SBarry Smith PetscFunctionBegin; 36808401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 3694c1414c8SBarry Smith node_max = a->inode.node_count; 3704c1414c8SBarry Smith ns = a->inode.size; /* Node Size array */ 3719566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x)); 3729566063dSJacob Faibussowitsch PetscCall(VecGetArray(yy, &y)); 3734c1414c8SBarry Smith idx = a->j; 3744c1414c8SBarry Smith v1 = a->a; 3754c1414c8SBarry Smith ii = a->i; 3764c1414c8SBarry Smith 3774c1414c8SBarry Smith for (i = 0, row = 0; i < node_max; ++i) { 3784c1414c8SBarry Smith nsz = ns[i]; 3794c1414c8SBarry Smith n = ii[1] - ii[0]; 38098c9bda7SSatish Balay nonzerorow += (n > 0) * nsz; 3814c1414c8SBarry Smith ii += nsz; 38250d8bf02SJed Brown PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 38350d8bf02SJed Brown PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 3844c1414c8SBarry Smith sz = n; /* No of non zeros in this row */ 3854c1414c8SBarry Smith /* Switch on the size of Node */ 3864c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */ 3874c1414c8SBarry Smith case 1: 38875567043SBarry Smith sum1 = 0.; 3894c1414c8SBarry Smith 3904c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 3914c1414c8SBarry Smith i1 = idx[0]; /* The instructions are ordered to */ 3924c1414c8SBarry Smith i2 = idx[1]; /* make the compiler's job easy */ 3934c1414c8SBarry Smith idx += 2; 3944c1414c8SBarry Smith tmp0 = x[i1]; 3954c1414c8SBarry Smith tmp1 = x[i2]; 3969371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 3979371c9d4SSatish 
Balay v1 += 2; 3984c1414c8SBarry Smith } 3994c1414c8SBarry Smith 4004c1414c8SBarry Smith if (n == sz - 1) { /* Take care of the last nonzero */ 4014c1414c8SBarry Smith tmp0 = x[*idx++]; 4024c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4034c1414c8SBarry Smith } 4044c1414c8SBarry Smith y[row++] = sum1; 4054c1414c8SBarry Smith break; 4064c1414c8SBarry Smith case 2: 40775567043SBarry Smith sum1 = 0.; 40875567043SBarry Smith sum2 = 0.; 4094c1414c8SBarry Smith v2 = v1 + n; 4104c1414c8SBarry Smith 4114c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 4124c1414c8SBarry Smith i1 = idx[0]; 4134c1414c8SBarry Smith i2 = idx[1]; 4144c1414c8SBarry Smith idx += 2; 4154c1414c8SBarry Smith tmp0 = x[i1]; 4164c1414c8SBarry Smith tmp1 = x[i2]; 4179371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4189371c9d4SSatish Balay v1 += 2; 4199371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 4209371c9d4SSatish Balay v2 += 2; 4214c1414c8SBarry Smith } 4224c1414c8SBarry Smith if (n == sz - 1) { 4234c1414c8SBarry Smith tmp0 = x[*idx++]; 4244c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4254c1414c8SBarry Smith sum2 += *v2++ * tmp0; 4264c1414c8SBarry Smith } 4274c1414c8SBarry Smith y[row++] = sum1; 4284c1414c8SBarry Smith y[row++] = sum2; 4294c1414c8SBarry Smith v1 = v2; /* Since the next block to be processed starts there*/ 4304c1414c8SBarry Smith idx += sz; 4314c1414c8SBarry Smith break; 4324c1414c8SBarry Smith case 3: 43375567043SBarry Smith sum1 = 0.; 43475567043SBarry Smith sum2 = 0.; 43575567043SBarry Smith sum3 = 0.; 4364c1414c8SBarry Smith v2 = v1 + n; 4374c1414c8SBarry Smith v3 = v2 + n; 4384c1414c8SBarry Smith 4394c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 4404c1414c8SBarry Smith i1 = idx[0]; 4414c1414c8SBarry Smith i2 = idx[1]; 4424c1414c8SBarry Smith idx += 2; 4434c1414c8SBarry Smith tmp0 = x[i1]; 4444c1414c8SBarry Smith tmp1 = x[i2]; 4459371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4469371c9d4SSatish Balay v1 += 2; 4479371c9d4SSatish Balay sum2 += 
v2[0] * tmp0 + v2[1] * tmp1; 4489371c9d4SSatish Balay v2 += 2; 4499371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 4509371c9d4SSatish Balay v3 += 2; 4514c1414c8SBarry Smith } 4524c1414c8SBarry Smith if (n == sz - 1) { 4534c1414c8SBarry Smith tmp0 = x[*idx++]; 4544c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4554c1414c8SBarry Smith sum2 += *v2++ * tmp0; 4564c1414c8SBarry Smith sum3 += *v3++ * tmp0; 4574c1414c8SBarry Smith } 4584c1414c8SBarry Smith y[row++] = sum1; 4594c1414c8SBarry Smith y[row++] = sum2; 4604c1414c8SBarry Smith y[row++] = sum3; 4614c1414c8SBarry Smith v1 = v3; /* Since the next block to be processed starts there*/ 4624c1414c8SBarry Smith idx += 2 * sz; 4634c1414c8SBarry Smith break; 4644c1414c8SBarry Smith case 4: 46575567043SBarry Smith sum1 = 0.; 46675567043SBarry Smith sum2 = 0.; 46775567043SBarry Smith sum3 = 0.; 46875567043SBarry Smith sum4 = 0.; 4694c1414c8SBarry Smith v2 = v1 + n; 4704c1414c8SBarry Smith v3 = v2 + n; 4714c1414c8SBarry Smith v4 = v3 + n; 4724c1414c8SBarry Smith 4734c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 4744c1414c8SBarry Smith i1 = idx[0]; 4754c1414c8SBarry Smith i2 = idx[1]; 4764c1414c8SBarry Smith idx += 2; 4774c1414c8SBarry Smith tmp0 = x[i1]; 4784c1414c8SBarry Smith tmp1 = x[i2]; 4799371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 4809371c9d4SSatish Balay v1 += 2; 4819371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 4829371c9d4SSatish Balay v2 += 2; 4839371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 4849371c9d4SSatish Balay v3 += 2; 4859371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1; 4869371c9d4SSatish Balay v4 += 2; 4874c1414c8SBarry Smith } 4884c1414c8SBarry Smith if (n == sz - 1) { 4894c1414c8SBarry Smith tmp0 = x[*idx++]; 4904c1414c8SBarry Smith sum1 += *v1++ * tmp0; 4914c1414c8SBarry Smith sum2 += *v2++ * tmp0; 4924c1414c8SBarry Smith sum3 += *v3++ * tmp0; 4934c1414c8SBarry Smith sum4 += *v4++ * tmp0; 4944c1414c8SBarry Smith } 4954c1414c8SBarry Smith 
y[row++] = sum1; 4964c1414c8SBarry Smith y[row++] = sum2; 4974c1414c8SBarry Smith y[row++] = sum3; 4984c1414c8SBarry Smith y[row++] = sum4; 4994c1414c8SBarry Smith v1 = v4; /* Since the next block to be processed starts there*/ 5004c1414c8SBarry Smith idx += 3 * sz; 5014c1414c8SBarry Smith break; 5024c1414c8SBarry Smith case 5: 50375567043SBarry Smith sum1 = 0.; 50475567043SBarry Smith sum2 = 0.; 50575567043SBarry Smith sum3 = 0.; 50675567043SBarry Smith sum4 = 0.; 50775567043SBarry Smith sum5 = 0.; 5084c1414c8SBarry Smith v2 = v1 + n; 5094c1414c8SBarry Smith v3 = v2 + n; 5104c1414c8SBarry Smith v4 = v3 + n; 5114c1414c8SBarry Smith v5 = v4 + n; 5124c1414c8SBarry Smith 5134c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) { 5144c1414c8SBarry Smith i1 = idx[0]; 5154c1414c8SBarry Smith i2 = idx[1]; 5164c1414c8SBarry Smith idx += 2; 5174c1414c8SBarry Smith tmp0 = x[i1]; 5184c1414c8SBarry Smith tmp1 = x[i2]; 5199371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1; 5209371c9d4SSatish Balay v1 += 2; 5219371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1; 5229371c9d4SSatish Balay v2 += 2; 5239371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1; 5249371c9d4SSatish Balay v3 += 2; 5259371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1; 5269371c9d4SSatish Balay v4 += 2; 5279371c9d4SSatish Balay sum5 += v5[0] * tmp0 + v5[1] * tmp1; 5289371c9d4SSatish Balay v5 += 2; 5294c1414c8SBarry Smith } 5304c1414c8SBarry Smith if (n == sz - 1) { 5314c1414c8SBarry Smith tmp0 = x[*idx++]; 5324c1414c8SBarry Smith sum1 += *v1++ * tmp0; 5334c1414c8SBarry Smith sum2 += *v2++ * tmp0; 5344c1414c8SBarry Smith sum3 += *v3++ * tmp0; 5354c1414c8SBarry Smith sum4 += *v4++ * tmp0; 5364c1414c8SBarry Smith sum5 += *v5++ * tmp0; 5374c1414c8SBarry Smith } 5384c1414c8SBarry Smith y[row++] = sum1; 5394c1414c8SBarry Smith y[row++] = sum2; 5404c1414c8SBarry Smith y[row++] = sum3; 5414c1414c8SBarry Smith y[row++] = sum4; 5424c1414c8SBarry Smith y[row++] = sum5; 5434c1414c8SBarry Smith v1 = 
v5; /* Since the next block to be processed starts there */
      idx += 4 * sz;
      break;
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
    }
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* ----------------------------------------------------------- */
/*
   MatMultAdd_SeqAIJ_Inode - Computes yy = zz + A * xx, processing the rows of A one inode at a time.

   Input Parameters:
+  A  - the matrix; A->data is read as a Mat_SeqAIJ and its inode.size / inode.node_count are used
.  xx - the vector that is multiplied (accessed read-only)
-  zz - the vector that is added; every row sum is seeded from the matching entry of zz

   Output Parameter:
.  yy - the result; one entry is written per matrix row

   Notes:
   Almost same code as the MatMult_SeqAIJ_Inode()

   All rows of a node are treated as having the same sz column indices: the index list of the
   first row is walked once for the whole node and idx is then advanced past the (nsz - 1)
   remaining copies. The value pointers v2..v5 are placed sz entries apart for the same reason.

   Errors with PETSC_ERR_COR when the inode size array is missing or when a node holds a
   number of rows other than 1 to 5.
*/
PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
  const MatScalar *v1, *v2, *v3, *v4, *v5;
  const PetscScalar *x;
  PetscScalar *y, *z, *zt;
  PetscInt i1, i2, n, i, row, node_max, nsz, sz;
  const PetscInt *idx, *ns, *ii;

  PetscFunctionBegin;
  PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
  node_max = a->inode.node_count;
  ns = a->inode.size; /* Node Size array */

  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(zz, yy, &z, &y));
  zt = z; /* running read cursor over z; y is indexed by row instead */

  idx = a->j;
  v1 = a->a;
  ii = a->i;

  for (i = 0, row = 0; i < node_max; ++i) {
    nsz = ns[i];
    n = ii[1] - ii[0]; /* row length, taken from the first row of the node */
    ii += nsz;
    sz = n; /* Number of nonzeros in each row of this node */
    /* Switch on the size of Node */
    switch (nsz) { /* Each loop in 'case' is unrolled */
    case 1:
      sum1 = *zt++;

      /* columns are consumed two at a time; an odd trailing column is handled after the loop */
      for (n = 0; n < sz - 1; n += 2) {
        i1 = idx[0]; /* The instructions are ordered to */
        i2 = idx[1]; /* make the compiler's job easy */
        idx += 2;
        tmp0 = x[i1];
        tmp1 = x[i2];
        sum1 += v1[0] * tmp0 + v1[1] * tmp1;
        v1 += 2;
      }

      if (n == sz - 1) { /* Take care of the last nonzero */
        tmp0 = x[*idx++];
        sum1 += *v1++ * tmp0;
      }
      y[row++] = sum1;
      break;
    case 2:
      sum1 = *zt++;
      sum2 = *zt++;
      v2 = v1 + n; /* values of the second row start sz entries after the first row */

      for (n = 0; n < sz - 1; n += 2) {
        i1 = idx[0];
        i2 = idx[1];
        idx += 2;
        tmp0 = x[i1];
        tmp1 = x[i2];
        sum1 += v1[0] * tmp0 + v1[1] * tmp1;
        v1 += 2;
        sum2 += v2[0] * tmp0 + v2[1] * tmp1;
        v2 += 2;
      }
      if (n == sz - 1) {
        tmp0 = x[*idx++];
        sum1 += *v1++ * tmp0;
        sum2 += *v2++ * tmp0;
      }
      y[row++] = sum1;
      y[row++] = sum2;
      v1 = v2; /* Since the next block to be processed starts there */
      idx += sz; /* skip the index list of the second row */
      break;
    case 3:
      sum1 = *zt++;
      sum2 = *zt++;
      sum3 = *zt++;
      v2 = v1 + n;
      v3 = v2 + n;

      for (n = 0; n < sz - 1; n += 2) {
        i1 = idx[0];
        i2 = idx[1];
        idx += 2;
        tmp0 = x[i1];
        tmp1 = x[i2];
        sum1 += v1[0] * tmp0 + v1[1] * tmp1;
        v1 += 2;
        sum2 += v2[0] * tmp0 + v2[1] * tmp1;
        v2 += 2;
        sum3 += v3[0] * tmp0 + v3[1] * tmp1;
        v3 += 2;
      }
      if (n == sz - 1) {
        tmp0 = x[*idx++];
        sum1 += *v1++ * tmp0;
        sum2 += *v2++ * tmp0;
        sum3 += *v3++ * tmp0;
      }
      y[row++] = sum1;
      y[row++] = sum2;
      y[row++] = sum3;
      v1 = v3; /* Since the next block to be processed starts there */
      idx += 2 * sz; /* skip the index lists of rows two and three */
      break;
    case 4:
      sum1 = *zt++;
      sum2 = *zt++;
      sum3 = *zt++;
      sum4 = *zt++;
      v2 = v1 + n;
      v3 = v2 + n;
      v4 = v3 + n;

      for (n = 0; n < sz - 1; n += 2) {
        i1 = idx[0];
        i2 = idx[1];
        idx += 2;
        tmp0 = x[i1];
        tmp1 = x[i2];
        sum1 += v1[0] * tmp0 + v1[1] * tmp1;
        v1 += 2;
        sum2 += v2[0] * tmp0 + v2[1] * tmp1;
        v2 += 2;
        sum3 += v3[0] * tmp0 + v3[1] * tmp1;
        v3 += 2;
        sum4 += v4[0] * tmp0 + v4[1] * tmp1;
        v4 += 2;
      }
      if (n == sz - 1) {
        tmp0 = x[*idx++];
        sum1 += *v1++ * tmp0;
        sum2 += *v2++ * tmp0;
        sum3 += *v3++ * tmp0;
        sum4 += *v4++ * tmp0;
      }
      y[row++] = sum1;
      y[row++] = sum2;
      y[row++] = sum3;
      y[row++] = sum4;
      v1 = v4; /* Since the next block to be processed starts there */
      idx += 3 * sz; /* skip the index lists of rows two to four */
      break;
    case 5:
      sum1 = *zt++;
      sum2 = *zt++;
      sum3 = *zt++;
      sum4 = *zt++;
      sum5 = *zt++;
      v2 = v1 + n;
      v3 = v2 + n;
      v4 = v3 + n;
      v5 = v4 + n;

      for (n = 0; n < sz - 1; n += 2) {
        i1 = idx[0];
        i2 = idx[1];
        idx += 2;
        tmp0 = x[i1];
        tmp1 = x[i2];
        sum1 += v1[0] * tmp0 + v1[1] * tmp1;
        v1 += 2;
        sum2 += v2[0] * tmp0 + v2[1] * tmp1;
        v2 += 2;
        sum3 += v3[0] * tmp0 + v3[1] * tmp1;
        v3 += 2;
        sum4 += v4[0] * tmp0 + v4[1] * tmp1;
        v4 += 2;
        sum5 += v5[0] * tmp0 + v5[1] * tmp1;
        v5 += 2;
      }
      if (n == sz - 1) {
        tmp0 = x[*idx++];
        sum1 += *v1++ * tmp0;
        sum2 += *v2++ * tmp0;
        sum3 += *v3++ * tmp0;
        sum4 += *v4++ * tmp0;
        sum5 += *v5++ * tmp0;
      }
      y[row++] = sum1;
      y[row++] = sum2;
      y[row++] = sum3;
      y[row++] = sum4;
      y[row++] = sum5;
      v1 = v5; /* Since the next block to be processed starts there */
      idx += 4 * sz; /* skip the index lists of rows two to five */
      break;
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
    }
  }
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
  PetscCall(PetscLogFlops(2.0 * a->nz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* ----------------------------------------------------------- */
/*
   MatSolve_SeqAIJ_Inode_inplace - Forward and backward triangular sweeps over the factors stored
   in A, processing the rows one inode at a time.

   Input Parameters:
+  A  - the factored matrix; A->data is read as a Mat_SeqAIJ (i, j, a, diag, row, col, solve_work, inode)
-  bb - the right-hand side, read through the row index set a->row

   Output Parameter:
.  xx - the solution, written through the column index set a->col starting from its last entry

   Notes:
   The forward sweep stores its result in a->solve_work; the backward sweep overwrites that work
   array row by row (last row first) and copies each finished value into xx.

   In the backward sweep every row result is multiplied by the entry at a->diag[row].
   NOTE(review): this presumably means the factorization stores the inverted pivot there - confirm
   against the routine that produces the in-place factors.

   Errors with PETSC_ERR_COR when the inode size array is missing or when a node holds a
   number of rows other than 1 to 5.
*/
static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  IS iscol = a->col, isrow = a->row;
  const PetscInt *r, *c, *rout, *cout;
  PetscInt i, j, n = A->rmap->n, nz;
  PetscInt node_max, *ns, row, nsz, aii, i0, i1;
  const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
  PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
  PetscScalar sum1, sum2, sum3, sum4, sum5;
  const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
  node_max = a->inode.node_count;
  ns = a->inode.size; /* Node Size array */

  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArrayWrite(xx, &x));
  tmp = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout + (n - 1); /* the backward sweep consumes the column indices from the end */

  /* forward solve the lower triangular */
  tmps = tmp; /* same array as tmp; used for reads of already computed entries */
  aa = a_a;
  aj = a_j;
  ad = a->diag;

  for (i = 0, row = 0; i < node_max; ++i) {
    nsz = ns[i];
    aii = ai[row];
    v1 = aa + aii;
    vi = aj + aii;
    nz = ad[row] - aii; /* entries of the node's first row that precede a->diag[row] */
    if (i < node_max - 1) {
      /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
       * but our indexing to determine its size could. */
      PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
      /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
      PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
      /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
    }

    switch (nsz) { /* Each loop in 'case' is unrolled */
    case 1:
      sum1 = b[*r++];
      for (j = 0; j < nz - 1; j += 2) {
        i0 = vi[0];
        i1 = vi[1];
        vi += 2;
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
        v1 += 2;
      }
      if (j == nz - 1) { /* odd trailing entry */
        tmp0 = tmps[*vi++];
        sum1 -= *v1++ * tmp0;
      }
      tmp[row++] = sum1;
      break;
    case 2:
      sum1 = b[*r++];
      sum2 = b[*r++];
      v2 = aa + ai[row + 1];

      /* the first nz columns are applied to every row of the node using the first row's indices */
      for (j = 0; j < nz - 1; j += 2) {
        i0 = vi[0];
        i1 = vi[1];
        vi += 2;
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
        v1 += 2;
        sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
        v2 += 2;
      }
      if (j == nz - 1) {
        tmp0 = tmps[*vi++];
        sum1 -= *v1++ * tmp0;
        sum2 -= *v2++ * tmp0;
      }
      /* coupling inside the node: the next stored entry of row two multiplies the result of row one */
      sum2 -= *v2++ * sum1;
      tmp[row++] = sum1;
      tmp[row++] = sum2;
      break;
    case 3:
      sum1 = b[*r++];
      sum2 = b[*r++];
      sum3 = b[*r++];
      v2 = aa + ai[row + 1];
      v3 = aa + ai[row + 2];

      for (j = 0; j < nz - 1; j += 2) {
        i0 = vi[0];
        i1 = vi[1];
        vi += 2;
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
        v1 += 2;
        sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
        v2 += 2;
        sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
        v3 += 2;
      }
      if (j == nz - 1) {
        tmp0 = tmps[*vi++];
        sum1 -= *v1++ * tmp0;
        sum2 -= *v2++ * tmp0;
        sum3 -= *v3++ * tmp0;
      }
      /* coupling inside the node, one earlier row at a time */
      sum2 -= *v2++ * sum1;
      sum3 -= *v3++ * sum1;
      sum3 -= *v3++ * sum2;

      tmp[row++] = sum1;
      tmp[row++] = sum2;
      tmp[row++] = sum3;
      break;

    case 4:
      sum1 = b[*r++];
      sum2 = b[*r++];
      sum3 = b[*r++];
      sum4 = b[*r++];
      v2 = aa + ai[row + 1];
      v3 = aa + ai[row + 2];
      v4 = aa + ai[row + 3];

      for (j = 0; j < nz - 1; j += 2) {
        i0 = vi[0];
        i1 = vi[1];
        vi += 2;
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
        v1 += 2;
        sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
        v2 += 2;
        sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
        v3 += 2;
        sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
        v4 += 2;
      }
      if (j == nz - 1) {
        tmp0 = tmps[*vi++];
        sum1 -= *v1++ * tmp0;
        sum2 -= *v2++ * tmp0;
        sum3 -= *v3++ * tmp0;
        sum4 -= *v4++ * tmp0;
      }
      /* coupling inside the node, one earlier row at a time */
      sum2 -= *v2++ * sum1;
      sum3 -= *v3++ * sum1;
      sum4 -= *v4++ * sum1;
      sum3 -= *v3++ * sum2;
      sum4 -= *v4++ * sum2;
      sum4 -= *v4++ * sum3;

      tmp[row++] = sum1;
      tmp[row++] = sum2;
      tmp[row++] = sum3;
      tmp[row++] = sum4;
      break;
    case 5:
      sum1 = b[*r++];
      sum2 = b[*r++];
      sum3 = b[*r++];
      sum4 = b[*r++];
      sum5 = b[*r++];
      v2 = aa + ai[row + 1];
      v3 = aa + ai[row + 2];
      v4 = aa + ai[row + 3];
      v5 = aa + ai[row + 4];

      for (j = 0; j < nz - 1; j += 2) {
        i0 = vi[0];
        i1 = vi[1];
        vi += 2;
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
        v1 += 2;
        sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
        v2 += 2;
        sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
        v3 += 2;
        sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
        v4 += 2;
        sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
        v5 += 2;
      }
      if (j == nz - 1) {
        tmp0 = tmps[*vi++];
        sum1 -= *v1++ * tmp0;
        sum2 -= *v2++ * tmp0;
        sum3 -= *v3++ * tmp0;
        sum4 -= *v4++ * tmp0;
        sum5 -= *v5++ * tmp0;
      }

      /* coupling inside the node, one earlier row at a time */
      sum2 -= *v2++ * sum1;
      sum3 -= *v3++ * sum1;
      sum4 -= *v4++ * sum1;
      sum5 -= *v5++ * sum1;
      sum3 -= *v3++ * sum2;
      sum4 -= *v4++ * sum2;
      sum5 -= *v5++ * sum2;
      sum4 -= *v4++ * sum3;
      sum5 -= *v5++ * sum3;
      sum5 -= *v5++ * sum4;

      tmp[row++] = sum1;
      tmp[row++] = sum2;
      tmp[row++] = sum3;
      tmp[row++] = sum4;
      tmp[row++] = sum5;
      break;
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
    }
  }
  /* backward solve the upper triangular */
  for (i = node_max - 1, row = n - 1; i >= 0; i--) {
    nsz = ns[i];
    aii = ai[row + 1] - 1; /* last stored entry of the node's last row */
    v1 = aa + aii;
    vi = aj + aii;
    nz = aii - ad[row]; /* entries of that row stored after a->diag[row] */
    switch (nsz) { /* Each loop in 'case' is unrolled */
    case 1:
      sum1 = tmp[row];

      /* entries are consumed from the end of the row, two at a time */
      for (j = nz; j > 1; j -= 2) {
        vi -= 2;
        i0 = vi[2];
        i1 = vi[1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        v1 -= 2;
        sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
      }
      if (j == 1) { /* odd remaining entry */
        tmp0 = tmps[*vi--];
        sum1 -= *v1-- * tmp0;
      }
      x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
      row--;
      break;
    case 2:
      sum1 = tmp[row];
      sum2 = tmp[row - 1];
      v2 = aa + ai[row] - 1; /* last stored entry of the previous row */
      for (j = nz; j > 1; j -= 2) {
        vi -= 2;
        i0 = vi[2];
        i1 = vi[1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        v1 -= 2;
        v2 -= 2;
        sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
        sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
      }
      if (j == 1) {
        tmp0 = tmps[*vi--];
        sum1 -= *v1-- * tmp0;
        sum2 -= *v2-- * tmp0;
      }

      /* finish the last row of the node first, then feed its value into the rows above it */
      tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
      row--;
      sum2 -= *v2-- * tmp0;
      x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
      row--;
      break;
    case 3:
      sum1 = tmp[row];
      sum2 = tmp[row - 1];
      sum3 = tmp[row - 2];
      v2 = aa + ai[row] - 1;
      v3 = aa + ai[row - 1] - 1;
      for (j = nz; j > 1; j -= 2) {
        vi -= 2;
        i0 = vi[2];
        i1 = vi[1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        v1 -= 2;
        v2 -= 2;
        v3 -= 2;
        sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
        sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
        sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
      }
      if (j == 1) {
        tmp0 = tmps[*vi--];
        sum1 -= *v1-- * tmp0;
        sum2 -= *v2-- * tmp0;
        sum3 -= *v3-- * tmp0;
      }
      tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
      row--;
      sum2 -= *v2-- * tmp0;
      sum3 -= *v3-- * tmp0;
      tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
      row--;
      sum3 -= *v3-- * tmp0;
      x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
      row--;

      break;
    case 4:
      sum1 = tmp[row];
      sum2 = tmp[row - 1];
      sum3 = tmp[row - 2];
      sum4 = tmp[row - 3];
      v2 = aa + ai[row] - 1;
      v3 = aa + ai[row - 1] - 1;
      v4 = aa + ai[row - 2] - 1;

      for (j = nz; j > 1; j -= 2) {
        vi -= 2;
        i0 = vi[2];
        i1 = vi[1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        v1 -= 2;
        v2 -= 2;
        v3 -= 2;
        v4 -= 2;
        sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
        sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
        sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
        sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
      }
      if (j == 1) {
        tmp0 = tmps[*vi--];
        sum1 -= *v1-- * tmp0;
        sum2 -= *v2-- * tmp0;
        sum3 -= *v3-- * tmp0;
        sum4 -= *v4-- * tmp0;
      }

      tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
      row--;
      sum2 -= *v2-- * tmp0;
      sum3 -= *v3-- * tmp0;
      sum4 -= *v4-- * tmp0;
      tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
      row--;
      sum3 -= *v3-- * tmp0;
      sum4 -= *v4-- * tmp0;
      tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
      row--;
      sum4 -= *v4-- * tmp0;
      x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
      row--;
      break;
    case 5:
      sum1 = tmp[row];
      sum2 = tmp[row - 1];
      sum3 = tmp[row - 2];
      sum4 = tmp[row - 3];
      sum5 = tmp[row - 4];
      v2 = aa + ai[row] - 1;
      v3 = aa + ai[row - 1] - 1;
      v4 = aa + ai[row - 2] - 1;
      v5 = aa + ai[row - 3] - 1;
      for (j = nz; j > 1; j -= 2) {
        vi -= 2;
        i0 = vi[2];
        i1 = vi[1];
        tmp0 = tmps[i0];
        tmp1 = tmps[i1];
        v1 -= 2;
        v2 -= 2;
        v3 -= 2;
        v4 -= 2;
        v5 -= 2;
        sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
        sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
        sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
        sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
        sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
      }
      if (j == 1) {
        tmp0 = tmps[*vi--];
        sum1 -= *v1-- * tmp0;
        sum2 -= *v2-- * tmp0;
        sum3 -= *v3-- * tmp0;
        sum4 -= *v4-- * tmp0;
        sum5 -= *v5-- * tmp0;
      }

      tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
      row--;
      sum2 -= *v2-- * tmp0;
      sum3 -= *v3-- * tmp0;
      sum4 -= *v4-- * tmp0;
      sum5 -= *v5-- * tmp0;
      tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
      row--;
      sum3 -= *v3-- * tmp0;
      sum4 -= *v4-- * tmp0;
      sum5 -= *v5-- * tmp0;
      tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
      row--;
      sum4 -= *v4-- * tmp0;
      sum5 -= *v5-- * tmp0;
      tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
      row--;
      sum5 -= *v5-- * tmp0;
      x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
      row--;
      break;
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
    }
  }
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArrayWrite(xx, &x));
  PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat C = B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
  IS isrow = b->row, isicol = b->icol;
  const PetscInt *r, *ic, *ics;
  const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
119728f1b45aSHong Zhang PetscInt i, j, k, nz, nzL, row, *pj; 119828f1b45aSHong Zhang const PetscInt *ajtmp, *bjtmp; 11999877982aSShri Abhyankar MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 12009877982aSShri Abhyankar const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 120128f1b45aSHong Zhang FactorShiftCtx sctx; 12024f81c4b7SBarry Smith const PetscInt *ddiag; 120328f1b45aSHong Zhang PetscReal rs; 120428f1b45aSHong Zhang MatScalar d; 12054f81c4b7SBarry Smith PetscInt inod, nodesz, node_max, col; 12064f81c4b7SBarry Smith const PetscInt *ns; 120707b50cabSHong Zhang PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 12080e95ead3SHong Zhang 120928f1b45aSHong Zhang PetscFunctionBegin; 121028f1b45aSHong Zhang /* MatPivotSetUp(): initialize shift context sctx */ 12119566063dSJacob Faibussowitsch PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 121228f1b45aSHong Zhang 1213f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 121428f1b45aSHong Zhang ddiag = a->diag; 121528f1b45aSHong Zhang sctx.shift_top = info->zeropivot; 121628f1b45aSHong Zhang for (i = 0; i < n; i++) { 121728f1b45aSHong Zhang /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 121828f1b45aSHong Zhang d = (aa)[ddiag[i]]; 121928f1b45aSHong Zhang rs = -PetscAbsScalar(d) - PetscRealPart(d); 122028f1b45aSHong Zhang v = aa + ai[i]; 122128f1b45aSHong Zhang nz = ai[i + 1] - ai[i]; 12222205254eSKarl Rupp for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 122328f1b45aSHong Zhang if (rs > sctx.shift_top) sctx.shift_top = rs; 122428f1b45aSHong Zhang } 122528f1b45aSHong Zhang sctx.shift_top *= 1.1; 122628f1b45aSHong Zhang sctx.nshift_max = 5; 122728f1b45aSHong Zhang sctx.shift_lo = 0.; 122828f1b45aSHong Zhang sctx.shift_hi = 1.; 122928f1b45aSHong Zhang } 123028f1b45aSHong Zhang 12319566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 12329566063dSJacob Faibussowitsch 
PetscCall(ISGetIndices(isicol, &ic)); 123368785679SHong Zhang 12349566063dSJacob Faibussowitsch PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 123528f1b45aSHong Zhang ics = ic; 123628f1b45aSHong Zhang 123728f1b45aSHong Zhang node_max = a->inode.node_count; 123828f1b45aSHong Zhang ns = a->inode.size; 123928b400f6SJacob Faibussowitsch PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 124028f1b45aSHong Zhang 12419877982aSShri Abhyankar /* If max inode size > 4, split it into two inodes.*/ 124268785679SHong Zhang /* also map the inode sizes according to the ordering */ 12439566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 124468785679SHong Zhang for (i = 0, j = 0; i < node_max; ++i, ++j) { 1245b1550197SShri Abhyankar if (ns[i] > 4) { 1246048b5e81SShri Abhyankar tmp_vec1[j] = 4; 124768785679SHong Zhang ++j; 124868785679SHong Zhang tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 124968785679SHong Zhang } else { 125068785679SHong Zhang tmp_vec1[j] = ns[i]; 125168785679SHong Zhang } 125268785679SHong Zhang } 125368785679SHong Zhang /* Use the correct node_max */ 125468785679SHong Zhang node_max = j; 125568785679SHong Zhang 125668785679SHong Zhang /* Now reorder the inode info based on mat re-ordering info */ 125768785679SHong Zhang /* First create a row -> inode_size_array_index map */ 12589566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &nsmap)); 12599566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 126068785679SHong Zhang for (i = 0, row = 0; i < node_max; i++) { 126168785679SHong Zhang nodesz = tmp_vec1[i]; 1262ad540459SPierre Jolivet for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 126368785679SHong Zhang } 126468785679SHong Zhang /* Using nsmap, create a reordered ns structure */ 126568785679SHong Zhang for (i = 0, j = 0; i < node_max; i++) { 126668785679SHong Zhang nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 
126768785679SHong Zhang tmp_vec2[i] = nodesz; 126868785679SHong Zhang j += nodesz; 126968785679SHong Zhang } 12709566063dSJacob Faibussowitsch PetscCall(PetscFree(nsmap)); 12719566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp_vec1)); 1272b89f182dSHong Zhang 127368785679SHong Zhang /* Now use the correct ns */ 127468785679SHong Zhang ns = tmp_vec2; 127568785679SHong Zhang 127628f1b45aSHong Zhang do { 127707b50cabSHong Zhang sctx.newshift = PETSC_FALSE; 127828f1b45aSHong Zhang /* Now loop over each block-row, and do the factorization */ 127928f1b45aSHong Zhang for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 128028f1b45aSHong Zhang nodesz = ns[inod]; 128128f1b45aSHong Zhang 128228f1b45aSHong Zhang switch (nodesz) { 128328f1b45aSHong Zhang case 1: 128468785679SHong Zhang /*----------*/ 1285b89f182dSHong Zhang /* zero rtmp1 */ 128628f1b45aSHong Zhang /* L part */ 128728f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 128828f1b45aSHong Zhang bjtmp = bj + bi[i]; 1289b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 129028f1b45aSHong Zhang 129128f1b45aSHong Zhang /* U part */ 129228f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1]; 129328f1b45aSHong Zhang bjtmp = bj + bdiag[i + 1] + 1; 1294b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 129528f1b45aSHong Zhang 129628f1b45aSHong Zhang /* load in initial (unfactored row) */ 129728f1b45aSHong Zhang nz = ai[r[i] + 1] - ai[r[i]]; 129828f1b45aSHong Zhang ajtmp = aj + ai[r[i]]; 129928f1b45aSHong Zhang v = aa + ai[r[i]]; 13002205254eSKarl Rupp for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 13012205254eSKarl Rupp 130228f1b45aSHong Zhang /* ZeropivotApply() */ 1303b89f182dSHong Zhang rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 130428f1b45aSHong Zhang 130528f1b45aSHong Zhang /* elimination */ 130628f1b45aSHong Zhang bjtmp = bj + bi[i]; 130728f1b45aSHong Zhang row = *bjtmp++; 130828f1b45aSHong Zhang nzL = bi[i + 1] - bi[i]; 
130928f1b45aSHong Zhang for (k = 0; k < nzL; k++) { 1310b89f182dSHong Zhang pc = rtmp1 + row; 131128f1b45aSHong Zhang if (*pc != 0.0) { 131228f1b45aSHong Zhang pv = b->a + bdiag[row]; 1313b89f182dSHong Zhang mul1 = *pc * (*pv); 1314b89f182dSHong Zhang *pc = mul1; 131528f1b45aSHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 131628f1b45aSHong Zhang pv = b->a + bdiag[row + 1] + 1; 131728f1b45aSHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1318b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 13199566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 132028f1b45aSHong Zhang } 132128f1b45aSHong Zhang row = *bjtmp++; 132228f1b45aSHong Zhang } 132328f1b45aSHong Zhang 132428f1b45aSHong Zhang /* finished row so stick it into b->a */ 132528f1b45aSHong Zhang rs = 0.0; 132628f1b45aSHong Zhang /* L part */ 132728f1b45aSHong Zhang pv = b->a + bi[i]; 132828f1b45aSHong Zhang pj = b->j + bi[i]; 132928f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 133028f1b45aSHong Zhang for (j = 0; j < nz; j++) { 13319371c9d4SSatish Balay pv[j] = rtmp1[pj[j]]; 13329371c9d4SSatish Balay rs += PetscAbsScalar(pv[j]); 133328f1b45aSHong Zhang } 133428f1b45aSHong Zhang 133528f1b45aSHong Zhang /* U part */ 133628f1b45aSHong Zhang pv = b->a + bdiag[i + 1] + 1; 133728f1b45aSHong Zhang pj = b->j + bdiag[i + 1] + 1; 133828f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; 133928f1b45aSHong Zhang for (j = 0; j < nz; j++) { 13409371c9d4SSatish Balay pv[j] = rtmp1[pj[j]]; 13419371c9d4SSatish Balay rs += PetscAbsScalar(pv[j]); 134228f1b45aSHong Zhang } 134328f1b45aSHong Zhang 1344b89f182dSHong Zhang /* Check zero pivot */ 134528f1b45aSHong Zhang sctx.rs = rs; 1346b89f182dSHong Zhang sctx.pv = rtmp1[i]; 13479566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 134807b50cabSHong Zhang if (sctx.newshift) break; 134928f1b45aSHong Zhang 1350a5b23f4aSJose E. 
Roman /* Mark diagonal and invert diagonal for simpler triangular solves */ 135128f1b45aSHong Zhang pv = b->a + bdiag[i]; 1352b89f182dSHong Zhang *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 135328f1b45aSHong Zhang break; 135428f1b45aSHong Zhang 135528f1b45aSHong Zhang case 2: 135628f1b45aSHong Zhang /*----------*/ 1357b89f182dSHong Zhang /* zero rtmp1 and rtmp2 */ 135828f1b45aSHong Zhang /* L part */ 135928f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 136028f1b45aSHong Zhang bjtmp = bj + bi[i]; 136128f1b45aSHong Zhang for (j = 0; j < nz; j++) { 136268785679SHong Zhang col = bjtmp[j]; 13639371c9d4SSatish Balay rtmp1[col] = 0.0; 13649371c9d4SSatish Balay rtmp2[col] = 0.0; 136528f1b45aSHong Zhang } 136628f1b45aSHong Zhang 136728f1b45aSHong Zhang /* U part */ 136828f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1]; 136928f1b45aSHong Zhang bjtmp = bj + bdiag[i + 1] + 1; 137028f1b45aSHong Zhang for (j = 0; j < nz; j++) { 137168785679SHong Zhang col = bjtmp[j]; 13729371c9d4SSatish Balay rtmp1[col] = 0.0; 13739371c9d4SSatish Balay rtmp2[col] = 0.0; 137428f1b45aSHong Zhang } 137528f1b45aSHong Zhang 137628f1b45aSHong Zhang /* load in initial (unfactored row) */ 137728f1b45aSHong Zhang nz = ai[r[i] + 1] - ai[r[i]]; 137828f1b45aSHong Zhang ajtmp = aj + ai[r[i]]; 13799371c9d4SSatish Balay v1 = aa + ai[r[i]]; 13809371c9d4SSatish Balay v2 = aa + ai[r[i] + 1]; 138128f1b45aSHong Zhang for (j = 0; j < nz; j++) { 138268785679SHong Zhang col = ics[ajtmp[j]]; 13839371c9d4SSatish Balay rtmp1[col] = v1[j]; 13849371c9d4SSatish Balay rtmp2[col] = v2[j]; 138528f1b45aSHong Zhang } 138628f1b45aSHong Zhang /* ZeropivotApply(): shift the diagonal of the matrix */ 13879371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount; 13889371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount; 138928f1b45aSHong Zhang 139028f1b45aSHong Zhang /* elimination */ 139128f1b45aSHong Zhang bjtmp = bj + bi[i]; 139228f1b45aSHong Zhang row = *bjtmp++; /* pivot row */ 
139328f1b45aSHong Zhang nzL = bi[i + 1] - bi[i]; 139428f1b45aSHong Zhang for (k = 0; k < nzL; k++) { 1395b89f182dSHong Zhang pc1 = rtmp1 + row; 1396b89f182dSHong Zhang pc2 = rtmp2 + row; 139728f1b45aSHong Zhang if (*pc1 != 0.0 || *pc2 != 0.0) { 139828f1b45aSHong Zhang pv = b->a + bdiag[row]; 13999371c9d4SSatish Balay mul1 = *pc1 * (*pv); 14009371c9d4SSatish Balay mul2 = *pc2 * (*pv); 14019371c9d4SSatish Balay *pc1 = mul1; 14029371c9d4SSatish Balay *pc2 = mul2; 140328f1b45aSHong Zhang 140428f1b45aSHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 140528f1b45aSHong Zhang pv = b->a + bdiag[row + 1] + 1; 140628f1b45aSHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 140728f1b45aSHong Zhang for (j = 0; j < nz; j++) { 140868785679SHong Zhang col = pj[j]; 1409b89f182dSHong Zhang rtmp1[col] -= mul1 * pv[j]; 1410b89f182dSHong Zhang rtmp2[col] -= mul2 * pv[j]; 141128f1b45aSHong Zhang } 14129566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz)); 141328f1b45aSHong Zhang } 141428f1b45aSHong Zhang row = *bjtmp++; 141528f1b45aSHong Zhang } 141628f1b45aSHong Zhang 1417b89f182dSHong Zhang /* finished row i; check zero pivot, then stick row i into b->a */ 141828f1b45aSHong Zhang rs = 0.0; 141928f1b45aSHong Zhang /* L part */ 1420b89f182dSHong Zhang pc1 = b->a + bi[i]; 142128f1b45aSHong Zhang pj = b->j + bi[i]; 142228f1b45aSHong Zhang nz = bi[i + 1] - bi[i]; 142328f1b45aSHong Zhang for (j = 0; j < nz; j++) { 142468785679SHong Zhang col = pj[j]; 14259371c9d4SSatish Balay pc1[j] = rtmp1[col]; 14269371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 142728f1b45aSHong Zhang } 142828f1b45aSHong Zhang /* U part */ 1429b89f182dSHong Zhang pc1 = b->a + bdiag[i + 1] + 1; 143028f1b45aSHong Zhang pj = b->j + bdiag[i + 1] + 1; 14310e7a5c2bSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 143228f1b45aSHong Zhang for (j = 0; j < nz; j++) { 143368785679SHong Zhang col = pj[j]; 14349371c9d4SSatish 
Balay pc1[j] = rtmp1[col]; 14359371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 143628f1b45aSHong Zhang } 143728f1b45aSHong Zhang 143828f1b45aSHong Zhang sctx.rs = rs; 1439b89f182dSHong Zhang sctx.pv = rtmp1[i]; 14409566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 144107b50cabSHong Zhang if (sctx.newshift) break; 1442b89f182dSHong Zhang pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1443b89f182dSHong Zhang *pc1 = 1.0 / sctx.pv; 1444b89f182dSHong Zhang 1445b89f182dSHong Zhang /* Now take care of diagonal 2x2 block. */ 1446b89f182dSHong Zhang pc2 = rtmp2 + i; 1447b89f182dSHong Zhang if (*pc2 != 0.0) { 1448b89f182dSHong Zhang mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1449b89f182dSHong Zhang *pc2 = mul1; /* insert L entry */ 1450b89f182dSHong Zhang pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1451b89f182dSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1452b89f182dSHong Zhang for (j = 0; j < nz; j++) { 14539371c9d4SSatish Balay col = pj[j]; 14549371c9d4SSatish Balay rtmp2[col] -= mul1 * rtmp1[col]; 145528f1b45aSHong Zhang } 14569566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1457b89f182dSHong Zhang } 1458b89f182dSHong Zhang 1459b89f182dSHong Zhang /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1460b89f182dSHong Zhang rs = 0.0; 1461b89f182dSHong Zhang /* L part */ 1462b89f182dSHong Zhang pc2 = b->a + bi[i + 1]; 1463b89f182dSHong Zhang pj = b->j + bi[i + 1]; 1464b89f182dSHong Zhang nz = bi[i + 2] - bi[i + 1]; 1465b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1466b89f182dSHong Zhang col = pj[j]; 14679371c9d4SSatish Balay pc2[j] = rtmp2[col]; 14689371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1469b89f182dSHong Zhang } 1470b89f182dSHong Zhang /* U part */ 1471b89f182dSHong Zhang pc2 = b->a + bdiag[i + 2] + 1; 14720e7a5c2bSHong Zhang pj = b->j + bdiag[i + 2] + 1; 14730e7a5c2bSHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* 
exclude diagonal */ 1474b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1475b89f182dSHong Zhang col = pj[j]; 14769371c9d4SSatish Balay pc2[j] = rtmp2[col]; 14779371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1478b89f182dSHong Zhang } 1479b89f182dSHong Zhang 148028f1b45aSHong Zhang sctx.rs = rs; 1481b89f182dSHong Zhang sctx.pv = rtmp2[i + 1]; 14829566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 148307b50cabSHong Zhang if (sctx.newshift) break; 148428f1b45aSHong Zhang pc2 = b->a + bdiag[i + 1]; 1485b89f182dSHong Zhang *pc2 = 1.0 / sctx.pv; 148628f1b45aSHong Zhang break; 1487b89f182dSHong Zhang 148868785679SHong Zhang case 3: 148968785679SHong Zhang /*----------*/ 149068785679SHong Zhang /* zero rtmp */ 149168785679SHong Zhang /* L part */ 149268785679SHong Zhang nz = bi[i + 1] - bi[i]; 149368785679SHong Zhang bjtmp = bj + bi[i]; 149468785679SHong Zhang for (j = 0; j < nz; j++) { 149568785679SHong Zhang col = bjtmp[j]; 14969371c9d4SSatish Balay rtmp1[col] = 0.0; 14979371c9d4SSatish Balay rtmp2[col] = 0.0; 14989371c9d4SSatish Balay rtmp3[col] = 0.0; 149968785679SHong Zhang } 150068785679SHong Zhang 150168785679SHong Zhang /* U part */ 150268785679SHong Zhang nz = bdiag[i] - bdiag[i + 1]; 150368785679SHong Zhang bjtmp = bj + bdiag[i + 1] + 1; 150468785679SHong Zhang for (j = 0; j < nz; j++) { 150568785679SHong Zhang col = bjtmp[j]; 15069371c9d4SSatish Balay rtmp1[col] = 0.0; 15079371c9d4SSatish Balay rtmp2[col] = 0.0; 15089371c9d4SSatish Balay rtmp3[col] = 0.0; 150968785679SHong Zhang } 151068785679SHong Zhang 151168785679SHong Zhang /* load in initial (unfactored row) */ 151268785679SHong Zhang nz = ai[r[i] + 1] - ai[r[i]]; 151368785679SHong Zhang ajtmp = aj + ai[r[i]]; 15149371c9d4SSatish Balay v1 = aa + ai[r[i]]; 15159371c9d4SSatish Balay v2 = aa + ai[r[i] + 1]; 15169371c9d4SSatish Balay v3 = aa + ai[r[i] + 2]; 151768785679SHong Zhang for (j = 0; j < nz; j++) { 151868785679SHong Zhang col = ics[ajtmp[j]]; 15199371c9d4SSatish Balay 
rtmp1[col] = v1[j]; 15209371c9d4SSatish Balay rtmp2[col] = v2[j]; 15219371c9d4SSatish Balay rtmp3[col] = v3[j]; 152268785679SHong Zhang } 152368785679SHong Zhang /* ZeropivotApply(): shift the diagonal of the matrix */ 15249371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount; 15259371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount; 15269371c9d4SSatish Balay rtmp3[i + 2] += sctx.shift_amount; 152768785679SHong Zhang 152868785679SHong Zhang /* elimination */ 152968785679SHong Zhang bjtmp = bj + bi[i]; 153068785679SHong Zhang row = *bjtmp++; /* pivot row */ 153168785679SHong Zhang nzL = bi[i + 1] - bi[i]; 153268785679SHong Zhang for (k = 0; k < nzL; k++) { 1533b89f182dSHong Zhang pc1 = rtmp1 + row; 1534b89f182dSHong Zhang pc2 = rtmp2 + row; 1535b89f182dSHong Zhang pc3 = rtmp3 + row; 153668785679SHong Zhang if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 153768785679SHong Zhang pv = b->a + bdiag[row]; 15389371c9d4SSatish Balay mul1 = *pc1 * (*pv); 15399371c9d4SSatish Balay mul2 = *pc2 * (*pv); 15409371c9d4SSatish Balay mul3 = *pc3 * (*pv); 15419371c9d4SSatish Balay *pc1 = mul1; 15429371c9d4SSatish Balay *pc2 = mul2; 15439371c9d4SSatish Balay *pc3 = mul3; 154468785679SHong Zhang 154568785679SHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 154668785679SHong Zhang pv = b->a + bdiag[row + 1] + 1; 154768785679SHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 154868785679SHong Zhang for (j = 0; j < nz; j++) { 154968785679SHong Zhang col = pj[j]; 1550b89f182dSHong Zhang rtmp1[col] -= mul1 * pv[j]; 1551b89f182dSHong Zhang rtmp2[col] -= mul2 * pv[j]; 1552b89f182dSHong Zhang rtmp3[col] -= mul3 * pv[j]; 155368785679SHong Zhang } 15549566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(3 + 6.0 * nz)); 155568785679SHong Zhang } 155668785679SHong Zhang row = *bjtmp++; 155768785679SHong Zhang } 155868785679SHong Zhang 1559b89f182dSHong Zhang /* finished row i; check zero pivot, then stick row i into b->a */ 
1560b89f182dSHong Zhang rs = 0.0; 1561b89f182dSHong Zhang /* L part */ 1562b89f182dSHong Zhang pc1 = b->a + bi[i]; 1563b89f182dSHong Zhang pj = b->j + bi[i]; 1564b89f182dSHong Zhang nz = bi[i + 1] - bi[i]; 1565b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1566b89f182dSHong Zhang col = pj[j]; 15679371c9d4SSatish Balay pc1[j] = rtmp1[col]; 15689371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 1569b89f182dSHong Zhang } 1570b89f182dSHong Zhang /* U part */ 1571b89f182dSHong Zhang pc1 = b->a + bdiag[i + 1] + 1; 1572b89f182dSHong Zhang pj = b->j + bdiag[i + 1] + 1; 15730e7a5c2bSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1574b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1575b89f182dSHong Zhang col = pj[j]; 15769371c9d4SSatish Balay pc1[j] = rtmp1[col]; 15779371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 1578b89f182dSHong Zhang } 157968785679SHong Zhang 1580b89f182dSHong Zhang sctx.rs = rs; 1581b89f182dSHong Zhang sctx.pv = rtmp1[i]; 15829566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 158307b50cabSHong Zhang if (sctx.newshift) break; 1584b89f182dSHong Zhang pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1585b89f182dSHong Zhang *pc1 = 1.0 / sctx.pv; 1586b89f182dSHong Zhang 1587b89f182dSHong Zhang /* Now take care of 1st column of diagonal 3x3 block. 
*/ 1588b89f182dSHong Zhang pc2 = rtmp2 + i; 1589b89f182dSHong Zhang pc3 = rtmp3 + i; 1590b89f182dSHong Zhang if (*pc2 != 0.0 || *pc3 != 0.0) { 15919371c9d4SSatish Balay mul2 = (*pc2) * (*pc1); 15929371c9d4SSatish Balay *pc2 = mul2; 15939371c9d4SSatish Balay mul3 = (*pc3) * (*pc1); 15949371c9d4SSatish Balay *pc3 = mul3; 159568785679SHong Zhang pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 159668785679SHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 159768785679SHong Zhang for (j = 0; j < nz; j++) { 159868785679SHong Zhang col = pj[j]; 1599b89f182dSHong Zhang rtmp2[col] -= mul2 * rtmp1[col]; 1600b89f182dSHong Zhang rtmp3[col] -= mul3 * rtmp1[col]; 160168785679SHong Zhang } 16029566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz)); 160368785679SHong Zhang } 160468785679SHong Zhang 1605b89f182dSHong Zhang /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1606b89f182dSHong Zhang rs = 0.0; 1607b89f182dSHong Zhang /* L part */ 1608b89f182dSHong Zhang pc2 = b->a + bi[i + 1]; 1609b89f182dSHong Zhang pj = b->j + bi[i + 1]; 1610b89f182dSHong Zhang nz = bi[i + 2] - bi[i + 1]; 1611b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1612b89f182dSHong Zhang col = pj[j]; 16139371c9d4SSatish Balay pc2[j] = rtmp2[col]; 16149371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1615b89f182dSHong Zhang } 1616b89f182dSHong Zhang /* U part */ 1617b89f182dSHong Zhang pc2 = b->a + bdiag[i + 2] + 1; 16180e7a5c2bSHong Zhang pj = b->j + bdiag[i + 2] + 1; 16190e7a5c2bSHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1620b89f182dSHong Zhang for (j = 0; j < nz; j++) { 1621b89f182dSHong Zhang col = pj[j]; 16229371c9d4SSatish Balay pc2[j] = rtmp2[col]; 16239371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 1624b89f182dSHong Zhang } 1625b89f182dSHong Zhang 1626b89f182dSHong Zhang sctx.rs = rs; 1627b89f182dSHong Zhang sctx.pv = rtmp2[i + 1]; 16289566063dSJacob Faibussowitsch 
PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 162907b50cabSHong Zhang if (sctx.newshift) break; 1630b89f182dSHong Zhang pc2 = b->a + bdiag[i + 1]; 1631b89f182dSHong Zhang *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1632b89f182dSHong Zhang 1633b89f182dSHong Zhang /* Now take care of 2nd column of diagonal 3x3 block. */ 1634b89f182dSHong Zhang pc3 = rtmp3 + i + 1; 163568785679SHong Zhang if (*pc3 != 0.0) { 16369371c9d4SSatish Balay mul3 = (*pc3) * (*pc2); 16379371c9d4SSatish Balay *pc3 = mul3; 163868785679SHong Zhang pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 163968785679SHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 164068785679SHong Zhang for (j = 0; j < nz; j++) { 164168785679SHong Zhang col = pj[j]; 1642b89f182dSHong Zhang rtmp3[col] -= mul3 * rtmp2[col]; 164368785679SHong Zhang } 16449566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 164568785679SHong Zhang } 164668785679SHong Zhang 1647b89f182dSHong Zhang /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 164868785679SHong Zhang rs = 0.0; 164968785679SHong Zhang /* L part */ 1650b89f182dSHong Zhang pc3 = b->a + bi[i + 2]; 1651b89f182dSHong Zhang pj = b->j + bi[i + 2]; 1652b89f182dSHong Zhang nz = bi[i + 3] - bi[i + 2]; 165368785679SHong Zhang for (j = 0; j < nz; j++) { 165468785679SHong Zhang col = pj[j]; 16559371c9d4SSatish Balay pc3[j] = rtmp3[col]; 16569371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 165768785679SHong Zhang } 165868785679SHong Zhang /* U part */ 1659b89f182dSHong Zhang pc3 = b->a + bdiag[i + 3] + 1; 16600e7a5c2bSHong Zhang pj = b->j + bdiag[i + 3] + 1; 16610e7a5c2bSHong Zhang nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 166268785679SHong Zhang for (j = 0; j < nz; j++) { 166368785679SHong Zhang col = pj[j]; 16649371c9d4SSatish Balay pc3[j] = rtmp3[col]; 16659371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 166668785679SHong Zhang } 166768785679SHong Zhang 
166868785679SHong Zhang sctx.rs = rs; 1669b89f182dSHong Zhang sctx.pv = rtmp3[i + 2]; 16709566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 167107b50cabSHong Zhang if (sctx.newshift) break; 167268785679SHong Zhang pc3 = b->a + bdiag[i + 2]; 1673b89f182dSHong Zhang *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 167468785679SHong Zhang break; 16759877982aSShri Abhyankar case 4: 16769877982aSShri Abhyankar /*----------*/ 16779877982aSShri Abhyankar /* zero rtmp */ 16789877982aSShri Abhyankar /* L part */ 16799877982aSShri Abhyankar nz = bi[i + 1] - bi[i]; 16809877982aSShri Abhyankar bjtmp = bj + bi[i]; 16819877982aSShri Abhyankar for (j = 0; j < nz; j++) { 16829877982aSShri Abhyankar col = bjtmp[j]; 16839371c9d4SSatish Balay rtmp1[col] = 0.0; 16849371c9d4SSatish Balay rtmp2[col] = 0.0; 16859371c9d4SSatish Balay rtmp3[col] = 0.0; 16869371c9d4SSatish Balay rtmp4[col] = 0.0; 16879877982aSShri Abhyankar } 16889877982aSShri Abhyankar 16899877982aSShri Abhyankar /* U part */ 16909877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1]; 16919877982aSShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1; 16929877982aSShri Abhyankar for (j = 0; j < nz; j++) { 16939877982aSShri Abhyankar col = bjtmp[j]; 16949371c9d4SSatish Balay rtmp1[col] = 0.0; 16959371c9d4SSatish Balay rtmp2[col] = 0.0; 16969371c9d4SSatish Balay rtmp3[col] = 0.0; 16979371c9d4SSatish Balay rtmp4[col] = 0.0; 16989877982aSShri Abhyankar } 16999877982aSShri Abhyankar 17009877982aSShri Abhyankar /* load in initial (unfactored row) */ 17019877982aSShri Abhyankar nz = ai[r[i] + 1] - ai[r[i]]; 17029877982aSShri Abhyankar ajtmp = aj + ai[r[i]]; 17039371c9d4SSatish Balay v1 = aa + ai[r[i]]; 17049371c9d4SSatish Balay v2 = aa + ai[r[i] + 1]; 17059371c9d4SSatish Balay v3 = aa + ai[r[i] + 2]; 17069371c9d4SSatish Balay v4 = aa + ai[r[i] + 3]; 17079877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17089877982aSShri Abhyankar col = ics[ajtmp[j]]; 17099371c9d4SSatish Balay rtmp1[col] = v1[j]; 
17109371c9d4SSatish Balay rtmp2[col] = v2[j]; 17119371c9d4SSatish Balay rtmp3[col] = v3[j]; 17129371c9d4SSatish Balay rtmp4[col] = v4[j]; 17139877982aSShri Abhyankar } 17149877982aSShri Abhyankar /* ZeropivotApply(): shift the diagonal of the matrix */ 17159371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount; 17169371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount; 17179371c9d4SSatish Balay rtmp3[i + 2] += sctx.shift_amount; 17189371c9d4SSatish Balay rtmp4[i + 3] += sctx.shift_amount; 17199877982aSShri Abhyankar 17209877982aSShri Abhyankar /* elimination */ 17219877982aSShri Abhyankar bjtmp = bj + bi[i]; 17229877982aSShri Abhyankar row = *bjtmp++; /* pivot row */ 17239877982aSShri Abhyankar nzL = bi[i + 1] - bi[i]; 17249877982aSShri Abhyankar for (k = 0; k < nzL; k++) { 17259877982aSShri Abhyankar pc1 = rtmp1 + row; 17269877982aSShri Abhyankar pc2 = rtmp2 + row; 17279877982aSShri Abhyankar pc3 = rtmp3 + row; 17289877982aSShri Abhyankar pc4 = rtmp4 + row; 17299877982aSShri Abhyankar if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 17309877982aSShri Abhyankar pv = b->a + bdiag[row]; 17319371c9d4SSatish Balay mul1 = *pc1 * (*pv); 17329371c9d4SSatish Balay mul2 = *pc2 * (*pv); 17339371c9d4SSatish Balay mul3 = *pc3 * (*pv); 17349371c9d4SSatish Balay mul4 = *pc4 * (*pv); 17359371c9d4SSatish Balay *pc1 = mul1; 17369371c9d4SSatish Balay *pc2 = mul2; 17379371c9d4SSatish Balay *pc3 = mul3; 17389371c9d4SSatish Balay *pc4 = mul4; 17399877982aSShri Abhyankar 17409877982aSShri Abhyankar pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 17419877982aSShri Abhyankar pv = b->a + bdiag[row + 1] + 1; 17429877982aSShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 17439877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17449877982aSShri Abhyankar col = pj[j]; 17459877982aSShri Abhyankar rtmp1[col] -= mul1 * pv[j]; 17469877982aSShri Abhyankar rtmp2[col] -= mul2 * pv[j]; 17479877982aSShri Abhyankar rtmp3[col] -= 
mul3 * pv[j]; 17489877982aSShri Abhyankar rtmp4[col] -= mul4 * pv[j]; 17499877982aSShri Abhyankar } 17509566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4 + 8.0 * nz)); 17519877982aSShri Abhyankar } 17529877982aSShri Abhyankar row = *bjtmp++; 17539877982aSShri Abhyankar } 17549877982aSShri Abhyankar 17559877982aSShri Abhyankar /* finished row i; check zero pivot, then stick row i into b->a */ 17569877982aSShri Abhyankar rs = 0.0; 17579877982aSShri Abhyankar /* L part */ 17589877982aSShri Abhyankar pc1 = b->a + bi[i]; 17599877982aSShri Abhyankar pj = b->j + bi[i]; 17609877982aSShri Abhyankar nz = bi[i + 1] - bi[i]; 17619877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17629877982aSShri Abhyankar col = pj[j]; 17639371c9d4SSatish Balay pc1[j] = rtmp1[col]; 17649371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 17659877982aSShri Abhyankar } 17669877982aSShri Abhyankar /* U part */ 17679877982aSShri Abhyankar pc1 = b->a + bdiag[i + 1] + 1; 17689877982aSShri Abhyankar pj = b->j + bdiag[i + 1] + 1; 17699877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 17709877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17719877982aSShri Abhyankar col = pj[j]; 17729371c9d4SSatish Balay pc1[j] = rtmp1[col]; 17739371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]); 17749877982aSShri Abhyankar } 17759877982aSShri Abhyankar 17769877982aSShri Abhyankar sctx.rs = rs; 17779877982aSShri Abhyankar sctx.pv = rtmp1[i]; 17789566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 177907b50cabSHong Zhang if (sctx.newshift) break; 17809877982aSShri Abhyankar pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 17819877982aSShri Abhyankar *pc1 = 1.0 / sctx.pv; 17829877982aSShri Abhyankar 17839877982aSShri Abhyankar /* Now take care of 1st column of diagonal 4x4 block. 
*/ 17849877982aSShri Abhyankar pc2 = rtmp2 + i; 17859877982aSShri Abhyankar pc3 = rtmp3 + i; 17869877982aSShri Abhyankar pc4 = rtmp4 + i; 17879877982aSShri Abhyankar if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 17889371c9d4SSatish Balay mul2 = (*pc2) * (*pc1); 17899371c9d4SSatish Balay *pc2 = mul2; 17909371c9d4SSatish Balay mul3 = (*pc3) * (*pc1); 17919371c9d4SSatish Balay *pc3 = mul3; 17929371c9d4SSatish Balay mul4 = (*pc4) * (*pc1); 17939371c9d4SSatish Balay *pc4 = mul4; 17949877982aSShri Abhyankar pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 17959877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 17969877982aSShri Abhyankar for (j = 0; j < nz; j++) { 17979877982aSShri Abhyankar col = pj[j]; 17989877982aSShri Abhyankar rtmp2[col] -= mul2 * rtmp1[col]; 17999877982aSShri Abhyankar rtmp3[col] -= mul3 * rtmp1[col]; 18009877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp1[col]; 18019877982aSShri Abhyankar } 18029566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(3 + 6.0 * nz)); 18039877982aSShri Abhyankar } 18049877982aSShri Abhyankar 18059877982aSShri Abhyankar /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 18069877982aSShri Abhyankar rs = 0.0; 18079877982aSShri Abhyankar /* L part */ 18089877982aSShri Abhyankar pc2 = b->a + bi[i + 1]; 18099877982aSShri Abhyankar pj = b->j + bi[i + 1]; 18109877982aSShri Abhyankar nz = bi[i + 2] - bi[i + 1]; 18119877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18129877982aSShri Abhyankar col = pj[j]; 18139371c9d4SSatish Balay pc2[j] = rtmp2[col]; 18149371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 18159877982aSShri Abhyankar } 18169877982aSShri Abhyankar /* U part */ 18179877982aSShri Abhyankar pc2 = b->a + bdiag[i + 2] + 1; 18189877982aSShri Abhyankar pj = b->j + bdiag[i + 2] + 1; 18199877982aSShri Abhyankar nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 18209877982aSShri Abhyankar for (j = 0; j < nz; j++) { 
18219877982aSShri Abhyankar col = pj[j]; 18229371c9d4SSatish Balay pc2[j] = rtmp2[col]; 18239371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]); 18249877982aSShri Abhyankar } 18259877982aSShri Abhyankar 18269877982aSShri Abhyankar sctx.rs = rs; 18279877982aSShri Abhyankar sctx.pv = rtmp2[i + 1]; 18289566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 182907b50cabSHong Zhang if (sctx.newshift) break; 18309877982aSShri Abhyankar pc2 = b->a + bdiag[i + 1]; 18319877982aSShri Abhyankar *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 18329877982aSShri Abhyankar 18339877982aSShri Abhyankar /* Now take care of 2nd column of diagonal 4x4 block. */ 18349877982aSShri Abhyankar pc3 = rtmp3 + i + 1; 18359877982aSShri Abhyankar pc4 = rtmp4 + i + 1; 18369877982aSShri Abhyankar if (*pc3 != 0.0 || *pc4 != 0.0) { 18379371c9d4SSatish Balay mul3 = (*pc3) * (*pc2); 18389371c9d4SSatish Balay *pc3 = mul3; 18399371c9d4SSatish Balay mul4 = (*pc4) * (*pc2); 18409371c9d4SSatish Balay *pc4 = mul4; 18419877982aSShri Abhyankar pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 18429877982aSShri Abhyankar nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 18439877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18449877982aSShri Abhyankar col = pj[j]; 18459877982aSShri Abhyankar rtmp3[col] -= mul3 * rtmp2[col]; 18469877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp2[col]; 18479877982aSShri Abhyankar } 18489566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * nz)); 18499877982aSShri Abhyankar } 18509877982aSShri Abhyankar 18519877982aSShri Abhyankar /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 18529877982aSShri Abhyankar rs = 0.0; 18539877982aSShri Abhyankar /* L part */ 18549877982aSShri Abhyankar pc3 = b->a + bi[i + 2]; 18559877982aSShri Abhyankar pj = b->j + bi[i + 2]; 18569877982aSShri Abhyankar nz = bi[i + 3] - bi[i + 2]; 18579877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18589877982aSShri 
Abhyankar col = pj[j]; 18599371c9d4SSatish Balay pc3[j] = rtmp3[col]; 18609371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 18619877982aSShri Abhyankar } 18629877982aSShri Abhyankar /* U part */ 18639877982aSShri Abhyankar pc3 = b->a + bdiag[i + 3] + 1; 18649877982aSShri Abhyankar pj = b->j + bdiag[i + 3] + 1; 18659877982aSShri Abhyankar nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 18669877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18679877982aSShri Abhyankar col = pj[j]; 18689371c9d4SSatish Balay pc3[j] = rtmp3[col]; 18699371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]); 18709877982aSShri Abhyankar } 18719877982aSShri Abhyankar 18729877982aSShri Abhyankar sctx.rs = rs; 18739877982aSShri Abhyankar sctx.pv = rtmp3[i + 2]; 18749566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 187507b50cabSHong Zhang if (sctx.newshift) break; 18769877982aSShri Abhyankar pc3 = b->a + bdiag[i + 2]; 18779877982aSShri Abhyankar *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 18789877982aSShri Abhyankar 18799877982aSShri Abhyankar /* Now take care of 3rd column of diagonal 4x4 block. 
*/ 18809877982aSShri Abhyankar pc4 = rtmp4 + i + 2; 18819877982aSShri Abhyankar if (*pc4 != 0.0) { 18829371c9d4SSatish Balay mul4 = (*pc4) * (*pc3); 18839371c9d4SSatish Balay *pc4 = mul4; 18849877982aSShri Abhyankar pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 18859877982aSShri Abhyankar nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 18869877982aSShri Abhyankar for (j = 0; j < nz; j++) { 18879877982aSShri Abhyankar col = pj[j]; 18889877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp3[col]; 18899877982aSShri Abhyankar } 18909566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz)); 18919877982aSShri Abhyankar } 18929877982aSShri Abhyankar 18939877982aSShri Abhyankar /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 18949877982aSShri Abhyankar rs = 0.0; 18959877982aSShri Abhyankar /* L part */ 18969877982aSShri Abhyankar pc4 = b->a + bi[i + 3]; 18979877982aSShri Abhyankar pj = b->j + bi[i + 3]; 18989877982aSShri Abhyankar nz = bi[i + 4] - bi[i + 3]; 18999877982aSShri Abhyankar for (j = 0; j < nz; j++) { 19009877982aSShri Abhyankar col = pj[j]; 19019371c9d4SSatish Balay pc4[j] = rtmp4[col]; 19029371c9d4SSatish Balay rs += PetscAbsScalar(pc4[j]); 19039877982aSShri Abhyankar } 19049877982aSShri Abhyankar /* U part */ 19059877982aSShri Abhyankar pc4 = b->a + bdiag[i + 4] + 1; 19069877982aSShri Abhyankar pj = b->j + bdiag[i + 4] + 1; 19079877982aSShri Abhyankar nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 19089877982aSShri Abhyankar for (j = 0; j < nz; j++) { 19099877982aSShri Abhyankar col = pj[j]; 19109371c9d4SSatish Balay pc4[j] = rtmp4[col]; 19119371c9d4SSatish Balay rs += PetscAbsScalar(pc4[j]); 19129877982aSShri Abhyankar } 19139877982aSShri Abhyankar 19149877982aSShri Abhyankar sctx.rs = rs; 19159877982aSShri Abhyankar sctx.pv = rtmp4[i + 3]; 19169566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 191707b50cabSHong Zhang if (sctx.newshift) 
break; 19189877982aSShri Abhyankar pc4 = b->a + bdiag[i + 3]; 19199877982aSShri Abhyankar *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 19209877982aSShri Abhyankar break; 192168785679SHong Zhang 1922d71ae5a4SJacob Faibussowitsch default: 1923d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 192428f1b45aSHong Zhang } 1925c2b86aeeSHong Zhang if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 192628f1b45aSHong Zhang i += nodesz; /* Update the row */ 192768785679SHong Zhang } 192828f1b45aSHong Zhang 192928f1b45aSHong Zhang /* MatPivotRefine() */ 193007b50cabSHong Zhang if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 193128f1b45aSHong Zhang /* 193228f1b45aSHong Zhang * if no shift in this attempt & shifting & started shifting & can refine, 193328f1b45aSHong Zhang * then try lower shift 193428f1b45aSHong Zhang */ 193528f1b45aSHong Zhang sctx.shift_hi = sctx.shift_fraction; 193628f1b45aSHong Zhang sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 193728f1b45aSHong Zhang sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 193807b50cabSHong Zhang sctx.newshift = PETSC_TRUE; 193928f1b45aSHong Zhang sctx.nshift++; 194028f1b45aSHong Zhang } 194107b50cabSHong Zhang } while (sctx.newshift); 194228f1b45aSHong Zhang 19439566063dSJacob Faibussowitsch PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 19449566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp_vec2)); 19459566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 19469566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 194728f1b45aSHong Zhang 1948abb87a52SBarry Smith if (b->inode.size) { 1949abb87a52SBarry Smith C->ops->solve = MatSolve_SeqAIJ_Inode; 1950abb87a52SBarry Smith } else { 1951d3ac4fa3SBarry Smith C->ops->solve = MatSolve_SeqAIJ; 1952abb87a52SBarry Smith } 195328f1b45aSHong Zhang 
C->ops->solveadd = MatSolveAdd_SeqAIJ; 195428f1b45aSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 195528f1b45aSHong Zhang C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 195628f1b45aSHong Zhang C->ops->matsolve = MatMatSolve_SeqAIJ; 195728f1b45aSHong Zhang C->assembled = PETSC_TRUE; 195828f1b45aSHong Zhang C->preallocated = PETSC_TRUE; 19592205254eSKarl Rupp 19609566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(C->cmap->n)); 196128f1b45aSHong Zhang 196228f1b45aSHong Zhang /* MatShiftView(A,info,&sctx) */ 196328f1b45aSHong Zhang if (sctx.nshift) { 1964f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 19659566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1966f4db908eSBarry Smith } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 19679566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1968f4db908eSBarry Smith } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 19699566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 197028f1b45aSHong Zhang } 197128f1b45aSHong Zhang } 19723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 197328f1b45aSHong Zhang } 1974628f99d7SShri Abhyankar 1975*ff6a9541SJacob Faibussowitsch #if 0 1976*ff6a9541SJacob Faibussowitsch // unused 1977*ff6a9541SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) 1978d71ae5a4SJacob Faibussowitsch { 1979628f99d7SShri Abhyankar Mat C = B; 1980628f99d7SShri Abhyankar Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b 
= (Mat_SeqAIJ *)C->data; 1981628f99d7SShri Abhyankar IS iscol = b->col, isrow = b->row, isicol = b->icol; 1982628f99d7SShri Abhyankar const PetscInt *r, *ic, *c, *ics; 1983628f99d7SShri Abhyankar PetscInt n = A->rmap->n, *bi = b->i; 1984628f99d7SShri Abhyankar PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow; 19858758e1faSBarry Smith PetscInt i, j, idx, *bd = b->diag, node_max, nodesz; 19868758e1faSBarry Smith PetscInt *ai = a->i, *aj = a->j; 1987628f99d7SShri Abhyankar PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj; 1988628f99d7SShri Abhyankar PetscScalar mul1, mul2, mul3, tmp; 1989628f99d7SShri Abhyankar MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33; 1990628f99d7SShri Abhyankar const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1; 1991628f99d7SShri Abhyankar PetscReal rs = 0.0; 1992628f99d7SShri Abhyankar FactorShiftCtx sctx; 1993628f99d7SShri Abhyankar 1994628f99d7SShri Abhyankar PetscFunctionBegin; 1995628f99d7SShri Abhyankar sctx.shift_top = 0; 1996628f99d7SShri Abhyankar sctx.nshift_max = 0; 1997628f99d7SShri Abhyankar sctx.shift_lo = 0; 1998628f99d7SShri Abhyankar sctx.shift_hi = 0; 1999628f99d7SShri Abhyankar sctx.shift_fraction = 0; 2000628f99d7SShri Abhyankar 2001628f99d7SShri Abhyankar /* if both shift schemes are chosen by user, only use info->shiftpd */ 2002f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 2003628f99d7SShri Abhyankar sctx.shift_top = 0; 2004628f99d7SShri Abhyankar for (i = 0; i < n; i++) { 2005628f99d7SShri Abhyankar /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 2006628f99d7SShri Abhyankar rs = 0.0; 2007628f99d7SShri Abhyankar ajtmp = aj + ai[i]; 2008628f99d7SShri Abhyankar rtmp1 = aa + ai[i]; 2009628f99d7SShri Abhyankar nz = ai[i + 1] - ai[i]; 2010628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2011628f99d7SShri Abhyankar if (*ajtmp != i) { 2012628f99d7SShri Abhyankar rs += 
PetscAbsScalar(*rtmp1++); 2013628f99d7SShri Abhyankar } else { 2014628f99d7SShri Abhyankar rs -= PetscRealPart(*rtmp1++); 2015628f99d7SShri Abhyankar } 2016628f99d7SShri Abhyankar ajtmp++; 2017628f99d7SShri Abhyankar } 2018628f99d7SShri Abhyankar if (rs > sctx.shift_top) sctx.shift_top = rs; 2019628f99d7SShri Abhyankar } 2020628f99d7SShri Abhyankar if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 2021628f99d7SShri Abhyankar sctx.shift_top *= 1.1; 2022628f99d7SShri Abhyankar sctx.nshift_max = 5; 2023628f99d7SShri Abhyankar sctx.shift_lo = 0.; 2024628f99d7SShri Abhyankar sctx.shift_hi = 1.; 2025628f99d7SShri Abhyankar } 2026628f99d7SShri Abhyankar sctx.shift_amount = 0; 2027628f99d7SShri Abhyankar sctx.nshift = 0; 2028628f99d7SShri Abhyankar 20299566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 20309566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &c)); 20319566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 20329566063dSJacob Faibussowitsch PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33)); 2033628f99d7SShri Abhyankar ics = ic; 2034628f99d7SShri Abhyankar 2035628f99d7SShri Abhyankar node_max = a->inode.node_count; 2036628f99d7SShri Abhyankar ns = a->inode.size; 203728b400f6SJacob Faibussowitsch PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 2038628f99d7SShri Abhyankar 2039628f99d7SShri Abhyankar /* If max inode size > 3, split it into two inodes.*/ 2040628f99d7SShri Abhyankar /* also map the inode sizes according to the ordering */ 20419566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 2042628f99d7SShri Abhyankar for (i = 0, j = 0; i < node_max; ++i, ++j) { 2043628f99d7SShri Abhyankar if (ns[i] > 3) { 2044628f99d7SShri Abhyankar tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */ 2045628f99d7SShri Abhyankar ++j; 2046628f99d7SShri Abhyankar tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 2047628f99d7SShri Abhyankar } else { 2048628f99d7SShri Abhyankar tmp_vec1[j] 
= ns[i]; 2049628f99d7SShri Abhyankar } 2050628f99d7SShri Abhyankar } 2051628f99d7SShri Abhyankar /* Use the correct node_max */ 2052628f99d7SShri Abhyankar node_max = j; 2053628f99d7SShri Abhyankar 2054628f99d7SShri Abhyankar /* Now reorder the inode info based on mat re-ordering info */ 2055628f99d7SShri Abhyankar /* First create a row -> inode_size_array_index map */ 20569566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2)); 2057628f99d7SShri Abhyankar for (i = 0, row = 0; i < node_max; i++) { 2058628f99d7SShri Abhyankar nodesz = tmp_vec1[i]; 2059ad540459SPierre Jolivet for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 2060628f99d7SShri Abhyankar } 2061628f99d7SShri Abhyankar /* Using nsmap, create a reordered ns structure */ 2062628f99d7SShri Abhyankar for (i = 0, j = 0; i < node_max; i++) { 2063628f99d7SShri Abhyankar nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 2064628f99d7SShri Abhyankar tmp_vec2[i] = nodesz; 2065628f99d7SShri Abhyankar j += nodesz; 2066628f99d7SShri Abhyankar } 20679566063dSJacob Faibussowitsch PetscCall(PetscFree2(nsmap, tmp_vec1)); 2068628f99d7SShri Abhyankar /* Now use the correct ns */ 2069628f99d7SShri Abhyankar ns = tmp_vec2; 2070628f99d7SShri Abhyankar 2071628f99d7SShri Abhyankar do { 207207b50cabSHong Zhang sctx.newshift = PETSC_FALSE; 2073628f99d7SShri Abhyankar /* Now loop over each block-row, and do the factorization */ 2074628f99d7SShri Abhyankar for (i = 0, row = 0; i < node_max; i++) { 2075628f99d7SShri Abhyankar nodesz = ns[i]; 2076628f99d7SShri Abhyankar nz = bi[row + 1] - bi[row]; 2077628f99d7SShri Abhyankar bjtmp = bj + bi[row]; 2078628f99d7SShri Abhyankar 2079628f99d7SShri Abhyankar switch (nodesz) { 2080628f99d7SShri Abhyankar case 1: 2081628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2082628f99d7SShri Abhyankar idx = bjtmp[j]; 2083628f99d7SShri Abhyankar rtmp11[idx] = 0.0; 2084628f99d7SShri Abhyankar } 2085628f99d7SShri Abhyankar 2086628f99d7SShri 
Abhyankar /* load in initial (unfactored row) */ 2087628f99d7SShri Abhyankar idx = r[row]; 2088628f99d7SShri Abhyankar nz_tmp = ai[idx + 1] - ai[idx]; 2089628f99d7SShri Abhyankar ajtmp = aj + ai[idx]; 2090628f99d7SShri Abhyankar v1 = aa + ai[idx]; 2091628f99d7SShri Abhyankar 2092628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2093628f99d7SShri Abhyankar idx = ics[ajtmp[j]]; 2094628f99d7SShri Abhyankar rtmp11[idx] = v1[j]; 2095628f99d7SShri Abhyankar } 2096628f99d7SShri Abhyankar rtmp11[ics[r[row]]] += sctx.shift_amount; 2097628f99d7SShri Abhyankar 2098628f99d7SShri Abhyankar prow = *bjtmp++; 2099628f99d7SShri Abhyankar while (prow < row) { 2100628f99d7SShri Abhyankar pc1 = rtmp11 + prow; 2101628f99d7SShri Abhyankar if (*pc1 != 0.0) { 2102628f99d7SShri Abhyankar pv = ba + bd[prow]; 2103628f99d7SShri Abhyankar pj = nbj + bd[prow]; 2104628f99d7SShri Abhyankar mul1 = *pc1 * *pv++; 2105628f99d7SShri Abhyankar *pc1 = mul1; 2106628f99d7SShri Abhyankar nz_tmp = bi[prow + 1] - bd[prow] - 1; 21079566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2108628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2109628f99d7SShri Abhyankar tmp = pv[j]; 2110628f99d7SShri Abhyankar idx = pj[j]; 2111628f99d7SShri Abhyankar rtmp11[idx] -= mul1 * tmp; 2112628f99d7SShri Abhyankar } 2113628f99d7SShri Abhyankar } 2114628f99d7SShri Abhyankar prow = *bjtmp++; 2115628f99d7SShri Abhyankar } 2116628f99d7SShri Abhyankar pj = bj + bi[row]; 2117628f99d7SShri Abhyankar pc1 = ba + bi[row]; 2118628f99d7SShri Abhyankar 2119628f99d7SShri Abhyankar sctx.pv = rtmp11[row]; 2120628f99d7SShri Abhyankar rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */ 2121628f99d7SShri Abhyankar rs = 0.0; 2122628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2123628f99d7SShri Abhyankar idx = pj[j]; 2124628f99d7SShri Abhyankar pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2125628f99d7SShri Abhyankar if (idx != row) rs += PetscAbsScalar(pc1[j]); 2126628f99d7SShri Abhyankar } 2127628f99d7SShri 
Abhyankar sctx.rs = rs; 21289566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 212907b50cabSHong Zhang if (sctx.newshift) goto endofwhile; 2130628f99d7SShri Abhyankar break; 2131628f99d7SShri Abhyankar 2132628f99d7SShri Abhyankar case 2: 2133628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2134628f99d7SShri Abhyankar idx = bjtmp[j]; 2135628f99d7SShri Abhyankar rtmp11[idx] = 0.0; 2136628f99d7SShri Abhyankar rtmp22[idx] = 0.0; 2137628f99d7SShri Abhyankar } 2138628f99d7SShri Abhyankar 2139628f99d7SShri Abhyankar /* load in initial (unfactored row) */ 2140628f99d7SShri Abhyankar idx = r[row]; 2141628f99d7SShri Abhyankar nz_tmp = ai[idx + 1] - ai[idx]; 2142628f99d7SShri Abhyankar ajtmp = aj + ai[idx]; 2143628f99d7SShri Abhyankar v1 = aa + ai[idx]; 2144628f99d7SShri Abhyankar v2 = aa + ai[idx + 1]; 2145628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2146628f99d7SShri Abhyankar idx = ics[ajtmp[j]]; 2147628f99d7SShri Abhyankar rtmp11[idx] = v1[j]; 2148628f99d7SShri Abhyankar rtmp22[idx] = v2[j]; 2149628f99d7SShri Abhyankar } 2150628f99d7SShri Abhyankar rtmp11[ics[r[row]]] += sctx.shift_amount; 2151628f99d7SShri Abhyankar rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2152628f99d7SShri Abhyankar 2153628f99d7SShri Abhyankar prow = *bjtmp++; 2154628f99d7SShri Abhyankar while (prow < row) { 2155628f99d7SShri Abhyankar pc1 = rtmp11 + prow; 2156628f99d7SShri Abhyankar pc2 = rtmp22 + prow; 2157628f99d7SShri Abhyankar if (*pc1 != 0.0 || *pc2 != 0.0) { 2158628f99d7SShri Abhyankar pv = ba + bd[prow]; 2159628f99d7SShri Abhyankar pj = nbj + bd[prow]; 2160628f99d7SShri Abhyankar mul1 = *pc1 * *pv; 2161628f99d7SShri Abhyankar mul2 = *pc2 * *pv; 2162628f99d7SShri Abhyankar ++pv; 2163628f99d7SShri Abhyankar *pc1 = mul1; 2164628f99d7SShri Abhyankar *pc2 = mul2; 2165628f99d7SShri Abhyankar 2166628f99d7SShri Abhyankar nz_tmp = bi[prow + 1] - bd[prow] - 1; 2167628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2168628f99d7SShri Abhyankar tmp = pv[j]; 
2169628f99d7SShri Abhyankar idx = pj[j]; 2170628f99d7SShri Abhyankar rtmp11[idx] -= mul1 * tmp; 2171628f99d7SShri Abhyankar rtmp22[idx] -= mul2 * tmp; 2172628f99d7SShri Abhyankar } 21739566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2174628f99d7SShri Abhyankar } 2175628f99d7SShri Abhyankar prow = *bjtmp++; 2176628f99d7SShri Abhyankar } 2177628f99d7SShri Abhyankar 2178628f99d7SShri Abhyankar /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2179628f99d7SShri Abhyankar pc1 = rtmp11 + prow; 2180628f99d7SShri Abhyankar pc2 = rtmp22 + prow; 2181628f99d7SShri Abhyankar 2182628f99d7SShri Abhyankar sctx.pv = *pc1; 2183628f99d7SShri Abhyankar pj = bj + bi[prow]; 2184628f99d7SShri Abhyankar rs = 0.0; 2185628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2186628f99d7SShri Abhyankar idx = pj[j]; 2187628f99d7SShri Abhyankar if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2188628f99d7SShri Abhyankar } 2189628f99d7SShri Abhyankar sctx.rs = rs; 21909566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 219107b50cabSHong Zhang if (sctx.newshift) goto endofwhile; 2192628f99d7SShri Abhyankar 2193628f99d7SShri Abhyankar if (*pc2 != 0.0) { 2194628f99d7SShri Abhyankar pj = nbj + bd[prow]; 2195628f99d7SShri Abhyankar mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/ 2196628f99d7SShri Abhyankar *pc2 = mul2; 2197628f99d7SShri Abhyankar nz_tmp = bi[prow + 1] - bd[prow] - 1; 2198628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2199628f99d7SShri Abhyankar idx = pj[j]; 2200628f99d7SShri Abhyankar tmp = rtmp11[idx]; 2201628f99d7SShri Abhyankar rtmp22[idx] -= mul2 * tmp; 2202628f99d7SShri Abhyankar } 22039566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2204628f99d7SShri Abhyankar } 2205628f99d7SShri Abhyankar 2206628f99d7SShri Abhyankar pj = bj + bi[row]; 2207628f99d7SShri Abhyankar pc1 = ba + bi[row]; 2208628f99d7SShri Abhyankar pc2 = ba + bi[row + 1]; 2209628f99d7SShri Abhyankar 
2210628f99d7SShri Abhyankar sctx.pv = rtmp22[row + 1]; 2211628f99d7SShri Abhyankar rs = 0.0; 2212628f99d7SShri Abhyankar rtmp11[row] = 1.0 / rtmp11[row]; 2213628f99d7SShri Abhyankar rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2214628f99d7SShri Abhyankar /* copy row entries from dense representation to sparse */ 2215628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2216628f99d7SShri Abhyankar idx = pj[j]; 2217628f99d7SShri Abhyankar pc1[j] = rtmp11[idx]; 2218628f99d7SShri Abhyankar pc2[j] = rtmp22[idx]; 2219628f99d7SShri Abhyankar if (idx != row + 1) rs += PetscAbsScalar(pc2[j]); 2220628f99d7SShri Abhyankar } 2221628f99d7SShri Abhyankar sctx.rs = rs; 22229566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 222307b50cabSHong Zhang if (sctx.newshift) goto endofwhile; 2224628f99d7SShri Abhyankar break; 2225628f99d7SShri Abhyankar 2226628f99d7SShri Abhyankar case 3: 2227628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2228628f99d7SShri Abhyankar idx = bjtmp[j]; 2229628f99d7SShri Abhyankar rtmp11[idx] = 0.0; 2230628f99d7SShri Abhyankar rtmp22[idx] = 0.0; 2231628f99d7SShri Abhyankar rtmp33[idx] = 0.0; 2232628f99d7SShri Abhyankar } 2233628f99d7SShri Abhyankar /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2234628f99d7SShri Abhyankar idx = r[row]; 2235628f99d7SShri Abhyankar nz_tmp = ai[idx + 1] - ai[idx]; 2236628f99d7SShri Abhyankar ajtmp = aj + ai[idx]; 2237628f99d7SShri Abhyankar v1 = aa + ai[idx]; 2238628f99d7SShri Abhyankar v2 = aa + ai[idx + 1]; 2239628f99d7SShri Abhyankar v3 = aa + ai[idx + 2]; 2240628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2241628f99d7SShri Abhyankar idx = ics[ajtmp[j]]; 2242628f99d7SShri Abhyankar rtmp11[idx] = v1[j]; 2243628f99d7SShri Abhyankar rtmp22[idx] = v2[j]; 2244628f99d7SShri Abhyankar rtmp33[idx] = v3[j]; 2245628f99d7SShri Abhyankar } 2246628f99d7SShri Abhyankar rtmp11[ics[r[row]]] += sctx.shift_amount; 2247628f99d7SShri Abhyankar rtmp22[ics[r[row + 1]]] += 
sctx.shift_amount; 2248628f99d7SShri Abhyankar rtmp33[ics[r[row + 2]]] += sctx.shift_amount; 2249628f99d7SShri Abhyankar 2250628f99d7SShri Abhyankar /* loop over all pivot row blocks above this row block */ 2251628f99d7SShri Abhyankar prow = *bjtmp++; 2252628f99d7SShri Abhyankar while (prow < row) { 2253628f99d7SShri Abhyankar pc1 = rtmp11 + prow; 2254628f99d7SShri Abhyankar pc2 = rtmp22 + prow; 2255628f99d7SShri Abhyankar pc3 = rtmp33 + prow; 2256628f99d7SShri Abhyankar if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 2257628f99d7SShri Abhyankar pv = ba + bd[prow]; 2258628f99d7SShri Abhyankar pj = nbj + bd[prow]; 2259628f99d7SShri Abhyankar mul1 = *pc1 * *pv; 2260628f99d7SShri Abhyankar mul2 = *pc2 * *pv; 2261628f99d7SShri Abhyankar mul3 = *pc3 * *pv; 2262628f99d7SShri Abhyankar ++pv; 2263628f99d7SShri Abhyankar *pc1 = mul1; 2264628f99d7SShri Abhyankar *pc2 = mul2; 2265628f99d7SShri Abhyankar *pc3 = mul3; 2266628f99d7SShri Abhyankar 2267628f99d7SShri Abhyankar nz_tmp = bi[prow + 1] - bd[prow] - 1; 2268628f99d7SShri Abhyankar /* update this row based on pivot row */ 2269628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2270628f99d7SShri Abhyankar tmp = pv[j]; 2271628f99d7SShri Abhyankar idx = pj[j]; 2272628f99d7SShri Abhyankar rtmp11[idx] -= mul1 * tmp; 2273628f99d7SShri Abhyankar rtmp22[idx] -= mul2 * tmp; 2274628f99d7SShri Abhyankar rtmp33[idx] -= mul3 * tmp; 2275628f99d7SShri Abhyankar } 22769566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp)); 2277628f99d7SShri Abhyankar } 2278628f99d7SShri Abhyankar prow = *bjtmp++; 2279628f99d7SShri Abhyankar } 2280628f99d7SShri Abhyankar 2281628f99d7SShri Abhyankar /* Now take care of diagonal 3x3 block in this set of rows */ 2282628f99d7SShri Abhyankar /* note: prow = row here */ 2283628f99d7SShri Abhyankar pc1 = rtmp11 + prow; 2284628f99d7SShri Abhyankar pc2 = rtmp22 + prow; 2285628f99d7SShri Abhyankar pc3 = rtmp33 + prow; 2286628f99d7SShri Abhyankar 2287628f99d7SShri Abhyankar sctx.pv = *pc1; 
2288628f99d7SShri Abhyankar pj = bj + bi[prow]; 2289628f99d7SShri Abhyankar rs = 0.0; 2290628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2291628f99d7SShri Abhyankar idx = pj[j]; 2292628f99d7SShri Abhyankar if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2293628f99d7SShri Abhyankar } 2294628f99d7SShri Abhyankar sctx.rs = rs; 22959566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 229607b50cabSHong Zhang if (sctx.newshift) goto endofwhile; 2297628f99d7SShri Abhyankar 2298628f99d7SShri Abhyankar if (*pc2 != 0.0 || *pc3 != 0.0) { 2299628f99d7SShri Abhyankar mul2 = (*pc2) / (*pc1); 2300628f99d7SShri Abhyankar mul3 = (*pc3) / (*pc1); 2301628f99d7SShri Abhyankar *pc2 = mul2; 2302628f99d7SShri Abhyankar *pc3 = mul3; 2303628f99d7SShri Abhyankar nz_tmp = bi[prow + 1] - bd[prow] - 1; 2304628f99d7SShri Abhyankar pj = nbj + bd[prow]; 2305628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2306628f99d7SShri Abhyankar idx = pj[j]; 2307628f99d7SShri Abhyankar tmp = rtmp11[idx]; 2308628f99d7SShri Abhyankar rtmp22[idx] -= mul2 * tmp; 2309628f99d7SShri Abhyankar rtmp33[idx] -= mul3 * tmp; 2310628f99d7SShri Abhyankar } 23119566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2312628f99d7SShri Abhyankar } 2313628f99d7SShri Abhyankar ++prow; 2314628f99d7SShri Abhyankar 2315628f99d7SShri Abhyankar pc2 = rtmp22 + prow; 2316628f99d7SShri Abhyankar pc3 = rtmp33 + prow; 2317628f99d7SShri Abhyankar sctx.pv = *pc2; 2318628f99d7SShri Abhyankar pj = bj + bi[prow]; 2319628f99d7SShri Abhyankar rs = 0.0; 2320628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2321628f99d7SShri Abhyankar idx = pj[j]; 2322628f99d7SShri Abhyankar if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2323628f99d7SShri Abhyankar } 2324628f99d7SShri Abhyankar sctx.rs = rs; 23259566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 232607b50cabSHong Zhang if (sctx.newshift) goto endofwhile; 2327628f99d7SShri Abhyankar 2328628f99d7SShri 
Abhyankar if (*pc3 != 0.0) { 2329628f99d7SShri Abhyankar mul3 = (*pc3) / (*pc2); 2330628f99d7SShri Abhyankar *pc3 = mul3; 2331628f99d7SShri Abhyankar pj = nbj + bd[prow]; 2332628f99d7SShri Abhyankar nz_tmp = bi[prow + 1] - bd[prow] - 1; 2333628f99d7SShri Abhyankar for (j = 0; j < nz_tmp; j++) { 2334628f99d7SShri Abhyankar idx = pj[j]; 2335628f99d7SShri Abhyankar tmp = rtmp22[idx]; 2336628f99d7SShri Abhyankar rtmp33[idx] -= mul3 * tmp; 2337628f99d7SShri Abhyankar } 23389566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2339628f99d7SShri Abhyankar } 2340628f99d7SShri Abhyankar 2341628f99d7SShri Abhyankar pj = bj + bi[row]; 2342628f99d7SShri Abhyankar pc1 = ba + bi[row]; 2343628f99d7SShri Abhyankar pc2 = ba + bi[row + 1]; 2344628f99d7SShri Abhyankar pc3 = ba + bi[row + 2]; 2345628f99d7SShri Abhyankar 2346628f99d7SShri Abhyankar sctx.pv = rtmp33[row + 2]; 2347628f99d7SShri Abhyankar rs = 0.0; 2348628f99d7SShri Abhyankar rtmp11[row] = 1.0 / rtmp11[row]; 2349628f99d7SShri Abhyankar rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2350628f99d7SShri Abhyankar rtmp33[row + 2] = 1.0 / rtmp33[row + 2]; 2351628f99d7SShri Abhyankar /* copy row entries from dense representation to sparse */ 2352628f99d7SShri Abhyankar for (j = 0; j < nz; j++) { 2353628f99d7SShri Abhyankar idx = pj[j]; 2354628f99d7SShri Abhyankar pc1[j] = rtmp11[idx]; 2355628f99d7SShri Abhyankar pc2[j] = rtmp22[idx]; 2356628f99d7SShri Abhyankar pc3[j] = rtmp33[idx]; 2357628f99d7SShri Abhyankar if (idx != row + 2) rs += PetscAbsScalar(pc3[j]); 2358628f99d7SShri Abhyankar } 2359628f99d7SShri Abhyankar 2360628f99d7SShri Abhyankar sctx.rs = rs; 23619566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2)); 236207b50cabSHong Zhang if (sctx.newshift) goto endofwhile; 2363628f99d7SShri Abhyankar break; 2364628f99d7SShri Abhyankar 2365d71ae5a4SJacob Faibussowitsch default: 2366d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 
2367628f99d7SShri Abhyankar } 2368628f99d7SShri Abhyankar row += nodesz; /* Update the row */ 2369628f99d7SShri Abhyankar } 2370628f99d7SShri Abhyankar endofwhile:; 237107b50cabSHong Zhang } while (sctx.newshift); 23729566063dSJacob Faibussowitsch PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33)); 23739566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp_vec2)); 23749566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 23759566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 23769566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &c)); 23772205254eSKarl Rupp 2378d3ac4fa3SBarry Smith (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2379628f99d7SShri Abhyankar /* do not set solve add, since MatSolve_Inode + Add is faster */ 2380628f99d7SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2381628f99d7SShri Abhyankar C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2382628f99d7SShri Abhyankar C->assembled = PETSC_TRUE; 2383628f99d7SShri Abhyankar C->preallocated = PETSC_TRUE; 2384628f99d7SShri Abhyankar if (sctx.nshift) { 2385f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 23869566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 2387f4db908eSBarry Smith } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 23889566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 2389628f99d7SShri Abhyankar } 2390628f99d7SShri Abhyankar } 23919566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(C->cmap->n)); 23929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCheckInode(C)); 23933ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 
2394628f99d7SShri Abhyankar } 2395*ff6a9541SJacob Faibussowitsch #endif 2396628f99d7SShri Abhyankar 2397019b515eSShri Abhyankar /* ----------------------------------------------------------- */ 2398d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 2399d71ae5a4SJacob Faibussowitsch { 2400019b515eSShri Abhyankar Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2401019b515eSShri Abhyankar IS iscol = a->col, isrow = a->row; 2402019b515eSShri Abhyankar const PetscInt *r, *c, *rout, *cout; 24038758e1faSBarry Smith PetscInt i, j, n = A->rmap->n; 24048758e1faSBarry Smith PetscInt node_max, row, nsz, aii, i0, i1, nz; 24058758e1faSBarry Smith const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2406019b515eSShri Abhyankar PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2407019b515eSShri Abhyankar PetscScalar sum1, sum2, sum3, sum4, sum5; 2408019b515eSShri Abhyankar const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2409019b515eSShri Abhyankar const PetscScalar *b; 2410019b515eSShri Abhyankar 2411019b515eSShri Abhyankar PetscFunctionBegin; 241208401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2413019b515eSShri Abhyankar node_max = a->inode.node_count; 2414019b515eSShri Abhyankar ns = a->inode.size; /* Node Size array */ 2415019b515eSShri Abhyankar 24169566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 24179566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(xx, &x)); 2418019b515eSShri Abhyankar tmp = a->solve_work; 2419019b515eSShri Abhyankar 24209371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 24219371c9d4SSatish Balay r = rout; 24229371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 24239371c9d4SSatish Balay c = cout; 2424019b515eSShri Abhyankar 2425019b515eSShri Abhyankar /* forward solve the lower triangular */ 2426019b515eSShri Abhyankar tmps = tmp; 2427019b515eSShri Abhyankar aa = a_a; 2428019b515eSShri Abhyankar aj = a_j; 
2429019b515eSShri Abhyankar ad = a->diag; 2430019b515eSShri Abhyankar 2431019b515eSShri Abhyankar for (i = 0, row = 0; i < node_max; ++i) { 2432019b515eSShri Abhyankar nsz = ns[i]; 2433019b515eSShri Abhyankar aii = ai[row]; 2434019b515eSShri Abhyankar v1 = aa + aii; 2435019b515eSShri Abhyankar vi = aj + aii; 2436019b515eSShri Abhyankar nz = ai[row + 1] - ai[row]; 2437019b515eSShri Abhyankar 243898991853SShri Abhyankar if (i < node_max - 1) { 243998991853SShri Abhyankar /* Prefetch the indices for the next block */ 244050d8bf02SJed Brown PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 244198991853SShri Abhyankar /* Prefetch the data for the next block */ 244250d8bf02SJed Brown PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 244398991853SShri Abhyankar } 244498991853SShri Abhyankar 2445019b515eSShri Abhyankar switch (nsz) { /* Each loop in 'case' is unrolled */ 2446019b515eSShri Abhyankar case 1: 2447019b515eSShri Abhyankar sum1 = b[r[row]]; 2448019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2449019b515eSShri Abhyankar i0 = vi[j]; 2450019b515eSShri Abhyankar i1 = vi[j + 1]; 2451019b515eSShri Abhyankar tmp0 = tmps[i0]; 2452019b515eSShri Abhyankar tmp1 = tmps[i1]; 2453019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2454019b515eSShri Abhyankar } 2455019b515eSShri Abhyankar if (j == nz - 1) { 2456019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2457019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2458019b515eSShri Abhyankar } 2459019b515eSShri Abhyankar tmp[row++] = sum1; 2460019b515eSShri Abhyankar break; 2461019b515eSShri Abhyankar case 2: 2462019b515eSShri Abhyankar sum1 = b[r[row]]; 2463019b515eSShri Abhyankar sum2 = b[r[row + 1]]; 2464019b515eSShri Abhyankar v2 = aa + ai[row + 1]; 2465019b515eSShri Abhyankar 2466019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2467019b515eSShri Abhyankar i0 = vi[j]; 
2468019b515eSShri Abhyankar i1 = vi[j + 1]; 2469019b515eSShri Abhyankar tmp0 = tmps[i0]; 2470019b515eSShri Abhyankar tmp1 = tmps[i1]; 2471019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2472019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2473019b515eSShri Abhyankar } 2474019b515eSShri Abhyankar if (j == nz - 1) { 2475019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2476019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2477019b515eSShri Abhyankar sum2 -= v2[j] * tmp0; 2478019b515eSShri Abhyankar } 2479019b515eSShri Abhyankar sum2 -= v2[nz] * sum1; 2480019b515eSShri Abhyankar tmp[row++] = sum1; 2481019b515eSShri Abhyankar tmp[row++] = sum2; 2482019b515eSShri Abhyankar break; 2483019b515eSShri Abhyankar case 3: 2484019b515eSShri Abhyankar sum1 = b[r[row]]; 2485019b515eSShri Abhyankar sum2 = b[r[row + 1]]; 2486019b515eSShri Abhyankar sum3 = b[r[row + 2]]; 2487019b515eSShri Abhyankar v2 = aa + ai[row + 1]; 2488019b515eSShri Abhyankar v3 = aa + ai[row + 2]; 2489019b515eSShri Abhyankar 2490019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2491019b515eSShri Abhyankar i0 = vi[j]; 2492019b515eSShri Abhyankar i1 = vi[j + 1]; 2493019b515eSShri Abhyankar tmp0 = tmps[i0]; 2494019b515eSShri Abhyankar tmp1 = tmps[i1]; 2495019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2496019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2497019b515eSShri Abhyankar sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2498019b515eSShri Abhyankar } 2499019b515eSShri Abhyankar if (j == nz - 1) { 2500019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2501019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2502019b515eSShri Abhyankar sum2 -= v2[j] * tmp0; 2503019b515eSShri Abhyankar sum3 -= v3[j] * tmp0; 2504019b515eSShri Abhyankar } 2505019b515eSShri Abhyankar sum2 -= v2[nz] * sum1; 2506019b515eSShri Abhyankar sum3 -= v3[nz] * sum1; 2507019b515eSShri Abhyankar sum3 -= v3[nz + 1] * sum2; 2508019b515eSShri Abhyankar tmp[row++] = sum1; 2509019b515eSShri Abhyankar 
tmp[row++] = sum2; 2510019b515eSShri Abhyankar tmp[row++] = sum3; 2511019b515eSShri Abhyankar break; 2512019b515eSShri Abhyankar 2513019b515eSShri Abhyankar case 4: 2514019b515eSShri Abhyankar sum1 = b[r[row]]; 2515019b515eSShri Abhyankar sum2 = b[r[row + 1]]; 2516019b515eSShri Abhyankar sum3 = b[r[row + 2]]; 2517019b515eSShri Abhyankar sum4 = b[r[row + 3]]; 2518019b515eSShri Abhyankar v2 = aa + ai[row + 1]; 2519019b515eSShri Abhyankar v3 = aa + ai[row + 2]; 2520019b515eSShri Abhyankar v4 = aa + ai[row + 3]; 2521019b515eSShri Abhyankar 2522019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2523019b515eSShri Abhyankar i0 = vi[j]; 2524019b515eSShri Abhyankar i1 = vi[j + 1]; 2525019b515eSShri Abhyankar tmp0 = tmps[i0]; 2526019b515eSShri Abhyankar tmp1 = tmps[i1]; 2527019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2528019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2529019b515eSShri Abhyankar sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2530019b515eSShri Abhyankar sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2531019b515eSShri Abhyankar } 2532019b515eSShri Abhyankar if (j == nz - 1) { 2533019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2534019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2535019b515eSShri Abhyankar sum2 -= v2[j] * tmp0; 2536019b515eSShri Abhyankar sum3 -= v3[j] * tmp0; 2537019b515eSShri Abhyankar sum4 -= v4[j] * tmp0; 2538019b515eSShri Abhyankar } 2539019b515eSShri Abhyankar sum2 -= v2[nz] * sum1; 2540019b515eSShri Abhyankar sum3 -= v3[nz] * sum1; 2541019b515eSShri Abhyankar sum4 -= v4[nz] * sum1; 2542019b515eSShri Abhyankar sum3 -= v3[nz + 1] * sum2; 2543019b515eSShri Abhyankar sum4 -= v4[nz + 1] * sum2; 2544019b515eSShri Abhyankar sum4 -= v4[nz + 2] * sum3; 2545019b515eSShri Abhyankar 2546019b515eSShri Abhyankar tmp[row++] = sum1; 2547019b515eSShri Abhyankar tmp[row++] = sum2; 2548019b515eSShri Abhyankar tmp[row++] = sum3; 2549019b515eSShri Abhyankar tmp[row++] = sum4; 2550019b515eSShri Abhyankar break; 2551019b515eSShri 
Abhyankar case 5: 2552019b515eSShri Abhyankar sum1 = b[r[row]]; 2553019b515eSShri Abhyankar sum2 = b[r[row + 1]]; 2554019b515eSShri Abhyankar sum3 = b[r[row + 2]]; 2555019b515eSShri Abhyankar sum4 = b[r[row + 3]]; 2556019b515eSShri Abhyankar sum5 = b[r[row + 4]]; 2557019b515eSShri Abhyankar v2 = aa + ai[row + 1]; 2558019b515eSShri Abhyankar v3 = aa + ai[row + 2]; 2559019b515eSShri Abhyankar v4 = aa + ai[row + 3]; 2560019b515eSShri Abhyankar v5 = aa + ai[row + 4]; 2561019b515eSShri Abhyankar 2562019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2563019b515eSShri Abhyankar i0 = vi[j]; 2564019b515eSShri Abhyankar i1 = vi[j + 1]; 2565019b515eSShri Abhyankar tmp0 = tmps[i0]; 2566019b515eSShri Abhyankar tmp1 = tmps[i1]; 2567019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2568019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2569019b515eSShri Abhyankar sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2570019b515eSShri Abhyankar sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2571019b515eSShri Abhyankar sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2572019b515eSShri Abhyankar } 2573019b515eSShri Abhyankar if (j == nz - 1) { 2574019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2575019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2576019b515eSShri Abhyankar sum2 -= v2[j] * tmp0; 2577019b515eSShri Abhyankar sum3 -= v3[j] * tmp0; 2578019b515eSShri Abhyankar sum4 -= v4[j] * tmp0; 2579019b515eSShri Abhyankar sum5 -= v5[j] * tmp0; 2580019b515eSShri Abhyankar } 2581019b515eSShri Abhyankar 2582019b515eSShri Abhyankar sum2 -= v2[nz] * sum1; 2583019b515eSShri Abhyankar sum3 -= v3[nz] * sum1; 2584019b515eSShri Abhyankar sum4 -= v4[nz] * sum1; 2585019b515eSShri Abhyankar sum5 -= v5[nz] * sum1; 2586019b515eSShri Abhyankar sum3 -= v3[nz + 1] * sum2; 2587019b515eSShri Abhyankar sum4 -= v4[nz + 1] * sum2; 2588019b515eSShri Abhyankar sum5 -= v5[nz + 1] * sum2; 2589019b515eSShri Abhyankar sum4 -= v4[nz + 2] * sum3; 2590019b515eSShri Abhyankar sum5 -= v5[nz + 2] * sum3; 
2591019b515eSShri Abhyankar sum5 -= v5[nz + 3] * sum4; 2592019b515eSShri Abhyankar 2593019b515eSShri Abhyankar tmp[row++] = sum1; 2594019b515eSShri Abhyankar tmp[row++] = sum2; 2595019b515eSShri Abhyankar tmp[row++] = sum3; 2596019b515eSShri Abhyankar tmp[row++] = sum4; 2597019b515eSShri Abhyankar tmp[row++] = sum5; 2598019b515eSShri Abhyankar break; 2599d71ae5a4SJacob Faibussowitsch default: 2600d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2601019b515eSShri Abhyankar } 2602019b515eSShri Abhyankar } 2603019b515eSShri Abhyankar /* backward solve the upper triangular */ 2604019b515eSShri Abhyankar for (i = node_max - 1, row = n - 1; i >= 0; i--) { 2605019b515eSShri Abhyankar nsz = ns[i]; 2606019b515eSShri Abhyankar aii = ad[row + 1] + 1; 2607019b515eSShri Abhyankar v1 = aa + aii; 2608019b515eSShri Abhyankar vi = aj + aii; 2609019b515eSShri Abhyankar nz = ad[row] - ad[row + 1] - 1; 261098991853SShri Abhyankar 261198991853SShri Abhyankar if (i > 0) { 261298991853SShri Abhyankar /* Prefetch the indices for the next block */ 261350d8bf02SJed Brown PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 261498991853SShri Abhyankar /* Prefetch the data for the next block */ 261550d8bf02SJed Brown PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 261698991853SShri Abhyankar } 261798991853SShri Abhyankar 2618019b515eSShri Abhyankar switch (nsz) { /* Each loop in 'case' is unrolled */ 2619019b515eSShri Abhyankar case 1: 2620019b515eSShri Abhyankar sum1 = tmp[row]; 2621019b515eSShri Abhyankar 2622019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2623019b515eSShri Abhyankar i0 = vi[j]; 2624019b515eSShri Abhyankar i1 = vi[j + 1]; 2625019b515eSShri Abhyankar tmp0 = tmps[i0]; 2626019b515eSShri Abhyankar tmp1 = tmps[i1]; 2627019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * 
tmp1; 2628019b515eSShri Abhyankar } 2629019b515eSShri Abhyankar if (j == nz - 1) { 2630019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2631019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2632019b515eSShri Abhyankar } 26339371c9d4SSatish Balay x[c[row]] = tmp[row] = sum1 * v1[nz]; 26349371c9d4SSatish Balay row--; 2635019b515eSShri Abhyankar break; 2636019b515eSShri Abhyankar case 2: 2637019b515eSShri Abhyankar sum1 = tmp[row]; 2638019b515eSShri Abhyankar sum2 = tmp[row - 1]; 2639019b515eSShri Abhyankar v2 = aa + ad[row] + 1; 2640019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2641019b515eSShri Abhyankar i0 = vi[j]; 2642019b515eSShri Abhyankar i1 = vi[j + 1]; 2643019b515eSShri Abhyankar tmp0 = tmps[i0]; 2644019b515eSShri Abhyankar tmp1 = tmps[i1]; 2645019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2646019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2647019b515eSShri Abhyankar } 2648019b515eSShri Abhyankar if (j == nz - 1) { 2649019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2650019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2651019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0; 2652019b515eSShri Abhyankar } 2653019b515eSShri Abhyankar 26549371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 26559371c9d4SSatish Balay row--; 2656019b515eSShri Abhyankar sum2 -= v2[0] * tmp0; 26579371c9d4SSatish Balay x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 26589371c9d4SSatish Balay row--; 2659019b515eSShri Abhyankar break; 2660019b515eSShri Abhyankar case 3: 2661019b515eSShri Abhyankar sum1 = tmp[row]; 2662019b515eSShri Abhyankar sum2 = tmp[row - 1]; 2663019b515eSShri Abhyankar sum3 = tmp[row - 2]; 2664019b515eSShri Abhyankar v2 = aa + ad[row] + 1; 2665019b515eSShri Abhyankar v3 = aa + ad[row - 1] + 1; 2666019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2667019b515eSShri Abhyankar i0 = vi[j]; 2668019b515eSShri Abhyankar i1 = vi[j + 1]; 2669019b515eSShri Abhyankar tmp0 = tmps[i0]; 2670019b515eSShri Abhyankar tmp1 = tmps[i1]; 
2671019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2672019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2673019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2674019b515eSShri Abhyankar } 2675019b515eSShri Abhyankar if (j == nz - 1) { 2676019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2677019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2678019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0; 2679019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0; 2680019b515eSShri Abhyankar } 26819371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 26829371c9d4SSatish Balay row--; 2683019b515eSShri Abhyankar sum2 -= v2[0] * tmp0; 2684019b515eSShri Abhyankar sum3 -= v3[1] * tmp0; 26859371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 26869371c9d4SSatish Balay row--; 2687019b515eSShri Abhyankar sum3 -= v3[0] * tmp0; 26889371c9d4SSatish Balay x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 26899371c9d4SSatish Balay row--; 2690019b515eSShri Abhyankar 2691019b515eSShri Abhyankar break; 2692019b515eSShri Abhyankar case 4: 2693019b515eSShri Abhyankar sum1 = tmp[row]; 2694019b515eSShri Abhyankar sum2 = tmp[row - 1]; 2695019b515eSShri Abhyankar sum3 = tmp[row - 2]; 2696019b515eSShri Abhyankar sum4 = tmp[row - 3]; 2697019b515eSShri Abhyankar v2 = aa + ad[row] + 1; 2698019b515eSShri Abhyankar v3 = aa + ad[row - 1] + 1; 2699019b515eSShri Abhyankar v4 = aa + ad[row - 2] + 1; 2700019b515eSShri Abhyankar 2701019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2702019b515eSShri Abhyankar i0 = vi[j]; 2703019b515eSShri Abhyankar i1 = vi[j + 1]; 2704019b515eSShri Abhyankar tmp0 = tmps[i0]; 2705019b515eSShri Abhyankar tmp1 = tmps[i1]; 2706019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2707019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2708019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2709019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 
2710019b515eSShri Abhyankar } 2711019b515eSShri Abhyankar if (j == nz - 1) { 2712019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2713019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2714019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0; 2715019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0; 2716019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0; 2717019b515eSShri Abhyankar } 2718019b515eSShri Abhyankar 27199371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 27209371c9d4SSatish Balay row--; 2721019b515eSShri Abhyankar sum2 -= v2[0] * tmp0; 2722019b515eSShri Abhyankar sum3 -= v3[1] * tmp0; 2723019b515eSShri Abhyankar sum4 -= v4[2] * tmp0; 27249371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 27259371c9d4SSatish Balay row--; 2726019b515eSShri Abhyankar sum3 -= v3[0] * tmp0; 2727019b515eSShri Abhyankar sum4 -= v4[1] * tmp0; 27289371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 27299371c9d4SSatish Balay row--; 2730019b515eSShri Abhyankar sum4 -= v4[0] * tmp0; 27319371c9d4SSatish Balay x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 27329371c9d4SSatish Balay row--; 2733019b515eSShri Abhyankar break; 2734019b515eSShri Abhyankar case 5: 2735019b515eSShri Abhyankar sum1 = tmp[row]; 2736019b515eSShri Abhyankar sum2 = tmp[row - 1]; 2737019b515eSShri Abhyankar sum3 = tmp[row - 2]; 2738019b515eSShri Abhyankar sum4 = tmp[row - 3]; 2739019b515eSShri Abhyankar sum5 = tmp[row - 4]; 2740019b515eSShri Abhyankar v2 = aa + ad[row] + 1; 2741019b515eSShri Abhyankar v3 = aa + ad[row - 1] + 1; 2742019b515eSShri Abhyankar v4 = aa + ad[row - 2] + 1; 2743019b515eSShri Abhyankar v5 = aa + ad[row - 3] + 1; 2744019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) { 2745019b515eSShri Abhyankar i0 = vi[j]; 2746019b515eSShri Abhyankar i1 = vi[j + 1]; 2747019b515eSShri Abhyankar tmp0 = tmps[i0]; 2748019b515eSShri Abhyankar tmp1 = tmps[i1]; 2749019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2750019b515eSShri Abhyankar sum2 -= v2[j + 1] * 
tmp0 + v2[j + 2] * tmp1; 2751019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2752019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2753019b515eSShri Abhyankar sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2754019b515eSShri Abhyankar } 2755019b515eSShri Abhyankar if (j == nz - 1) { 2756019b515eSShri Abhyankar tmp0 = tmps[vi[j]]; 2757019b515eSShri Abhyankar sum1 -= v1[j] * tmp0; 2758019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0; 2759019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0; 2760019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0; 2761019b515eSShri Abhyankar sum5 -= v5[j + 4] * tmp0; 2762019b515eSShri Abhyankar } 2763019b515eSShri Abhyankar 27649371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 27659371c9d4SSatish Balay row--; 2766019b515eSShri Abhyankar sum2 -= v2[0] * tmp0; 2767019b515eSShri Abhyankar sum3 -= v3[1] * tmp0; 2768019b515eSShri Abhyankar sum4 -= v4[2] * tmp0; 2769019b515eSShri Abhyankar sum5 -= v5[3] * tmp0; 27709371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 27719371c9d4SSatish Balay row--; 2772019b515eSShri Abhyankar sum3 -= v3[0] * tmp0; 2773019b515eSShri Abhyankar sum4 -= v4[1] * tmp0; 2774019b515eSShri Abhyankar sum5 -= v5[2] * tmp0; 27759371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 27769371c9d4SSatish Balay row--; 2777019b515eSShri Abhyankar sum4 -= v4[0] * tmp0; 2778019b515eSShri Abhyankar sum5 -= v5[1] * tmp0; 27799371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 27809371c9d4SSatish Balay row--; 2781019b515eSShri Abhyankar sum5 -= v5[0] * tmp0; 27829371c9d4SSatish Balay x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 27839371c9d4SSatish Balay row--; 2784019b515eSShri Abhyankar break; 2785d71ae5a4SJacob Faibussowitsch default: 2786d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2787019b515eSShri Abhyankar } 2788019b515eSShri Abhyankar } 27899566063dSJacob 
Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 27909566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 27919566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 27929566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(xx, &x)); 27939566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 27943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2795019b515eSShri Abhyankar } 2796019b515eSShri Abhyankar 27974c1414c8SBarry Smith /* MatColoringPatch_SeqAIJ_Inode - expands a per-inode coloring into a per-entry coloring of length n = mat->cmap->n and wraps it in an ISColoring. Entry j of inode i receives color coloring[i] + j * ncolors; the colors that actually occur are then renumbered contiguously from 0 and ncolors is reduced to that count. Input: mat (its inode structure must exist, otherwise PETSC_ERR_COR is raised), ncolors (number of colors in the inode coloring), nin (not referenced in the body), coloring[] (one color per inode). Output: iscoloring, created by ISColoringCreate() which receives the new array with PETSC_OWN_POINTER. The caller-supplied coloring[] is freed before returning, so the caller must not use it afterwards. */ 2800d71ae5a4SJacob Faibussowitsch PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2801d71ae5a4SJacob Faibussowitsch { 28024c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2803d0f46423SBarry Smith PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row; 28044c1414c8SBarry Smith PetscInt *colorused, i; 28054c1414c8SBarry Smith ISColoringValue *newcolor; 28064c1414c8SBarry Smith 28074c1414c8SBarry Smith PetscFunctionBegin; 280808401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 28099566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &newcolor)); 28104c1414c8SBarry Smith /* loop over the m inodes, giving the j-th entry of inode i the color coloring[i] + j * ncolors so entries inside one inode never share a color. ns[] comes from a->inode.size while newcolor is later read for n = cmap->n entries -- presumably the matrix is square so both counts agree; TODO confirm */ 28114c1414c8SBarry Smith row = 0; 28124c1414c8SBarry Smith for (i = 0; i < m; i++) { 2813ad540459SPierre Jolivet for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors; 28144c1414c8SBarry Smith } 28154c1414c8SBarry Smith 28164c1414c8SBarry Smith /* eliminate unneeded colors: mark every color that occurs, prefix-sum the marks so colorused[c] becomes the 1-based rank of color c among the used colors, then renumber newcolor[] with rank - 1. The table has 5 * ncolors slots, presumably because inode sizes above 5 are rejected by the kernels in this file -- NOTE(review): confirm */ 28179566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2818ad540459SPierre Jolivet for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 28194c1414c8SBarry Smith 2820ad540459SPierre Jolivet for (i = 1; i < 5 * 
ncolors; i++) colorused[i] += colorused[i - 1]; 28214c1414c8SBarry Smith /* the last prefix sum is the number of distinct colors in use. NOTE(review): with ncolors == 0 this reads colorused[-1]; confirm callers never pass an empty coloring */ ncolors = colorused[5 * ncolors - 1]; 2822ad540459SPierre Jolivet for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1; 28239566063dSJacob Faibussowitsch PetscCall(PetscFree(colorused)); 28249566063dSJacob Faibussowitsch PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 28259566063dSJacob Faibussowitsch /* the input array is released here; only newcolor (passed to ISColoringCreate above with PETSC_OWN_POINTER) remains */ PetscCall(PetscFree(coloring)); 28263ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 28274c1414c8SBarry Smith } 28284c1414c8SBarry Smith 2829af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 28302af78befSBarry Smith 2831d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2832d71ae5a4SJacob Faibussowitsch { 28332af78befSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 28347aaeff0aSMatthew G. Knepley PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 28355850ef23SBarry Smith MatScalar *ibdiag, *bdiag, work[25], *t; 2836a8b09249SBarry Smith PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 28377aaeff0aSMatthew G. Knepley const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 28385850ef23SBarry Smith const PetscScalar *xb, *b; 28397b6c816cSBarry Smith PetscReal zeropivot = 100. 
* PETSC_MACHINE_EPSILON, shift = 0.0; 28408758e1faSBarry Smith PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2; 28418758e1faSBarry Smith PetscInt sz, k, ipvt[5]; 28427b6c816cSBarry Smith PetscBool allowzeropivot, zeropivotdetected; 28438758e1faSBarry Smith const PetscInt *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i; 28442af78befSBarry Smith 28452af78befSBarry Smith PetscFunctionBegin; 2846a455e926SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 284708401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 284808401ef6SPierre Jolivet PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 284908401ef6SPierre Jolivet PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 28502af78befSBarry Smith 285171f1c65dSBarry Smith if (!a->inode.ibdiagvalid) { 28522af78befSBarry Smith if (!a->inode.ibdiag) { 28532af78befSBarry Smith /* calculate space needed for diagonal blocks */ 2854ad540459SPierre Jolivet for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i]; 2855f0d39aaaSBarry Smith a->inode.bdiagsize = cnt; 28562205254eSKarl Rupp 28579566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 285871f1c65dSBarry Smith } 285971f1c65dSBarry Smith 286071f1c65dSBarry Smith /* copy over the diagonal blocks and invert them */ 28612af78befSBarry Smith ibdiag = a->inode.ibdiag; 28622af78befSBarry Smith bdiag = a->inode.bdiag; 28632af78befSBarry Smith cnt = 0; 28642af78befSBarry Smith for (i = 0, row = 0; i < m; i++) { 28652af78befSBarry Smith for (j = 0; j < sizes[i]; j++) { 2866ad540459SPierre Jolivet for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k]; 28672af78befSBarry Smith } 28689566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * 
sizes[i])); 28692af78befSBarry Smith 28702af78befSBarry Smith switch (sizes[i]) { 28712af78befSBarry Smith case 1: 28722af78befSBarry Smith /* Create matrix data structure */ 28738e0e2a9aSHong Zhang if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 28748e0e2a9aSHong Zhang if (allowzeropivot) { 28757b6c816cSBarry Smith A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 28767b6c816cSBarry Smith A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 28777b6c816cSBarry Smith A->factorerror_zeropivot_row = row; 28789566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 287998921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 28808e0e2a9aSHong Zhang } 288164c62002SMatthew Knepley ibdiag[cnt] = 1.0 / ibdiag[cnt]; 28822af78befSBarry Smith break; 28832af78befSBarry Smith case 2: 28849566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 28857b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 28862af78befSBarry Smith break; 28872af78befSBarry Smith case 3: 28889566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 28897b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 28902af78befSBarry Smith break; 28912af78befSBarry Smith case 4: 28929566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 28937b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 28942af78befSBarry Smith break; 28952af78befSBarry Smith case 5: 28969566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 28977b6c816cSBarry Smith if (zeropivotdetected) 
A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 28982af78befSBarry Smith break; 2899d71ae5a4SJacob Faibussowitsch default: 2900d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 29012af78befSBarry Smith } 29022af78befSBarry Smith cnt += sizes[i] * sizes[i]; 29032af78befSBarry Smith row += sizes[i]; 29042af78befSBarry Smith } 290571f1c65dSBarry Smith a->inode.ibdiagvalid = PETSC_TRUE; 29062af78befSBarry Smith } 29072af78befSBarry Smith ibdiag = a->inode.ibdiag; 29082af78befSBarry Smith bdiag = a->inode.bdiag; 29095850ef23SBarry Smith t = a->inode.ssor_work; 29102af78befSBarry Smith 29119566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 29129566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 29135850ef23SBarry Smith /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 29145850ef23SBarry Smith if (flag & SOR_ZERO_INITIAL_GUESS) { 29152af78befSBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 29168862d2efSBarry Smith for (i = 0, row = 0; i < m; i++) { 29178862d2efSBarry Smith sz = diag[row] - ii[row]; 29188862d2efSBarry Smith v1 = a->a + ii[row]; 29198862d2efSBarry Smith idx = a->j + ii[row]; 29208862d2efSBarry Smith 29214108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 29228862d2efSBarry Smith switch (sizes[i]) { 29238862d2efSBarry Smith case 1: 29248862d2efSBarry Smith 29258862d2efSBarry Smith sum1 = b[row]; 29268862d2efSBarry Smith for (n = 0; n < sz - 1; n += 2) { 29278862d2efSBarry Smith i1 = idx[0]; 29288862d2efSBarry Smith i2 = idx[1]; 29298862d2efSBarry Smith idx += 2; 29308862d2efSBarry Smith tmp0 = x[i1]; 29318862d2efSBarry Smith tmp1 = x[i2]; 29329371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 29339371c9d4SSatish Balay v1 += 2; 29348862d2efSBarry Smith } 29358862d2efSBarry Smith 29368862d2efSBarry Smith if (n == sz - 1) 
{ 2937f0d39aaaSBarry Smith tmp0 = x[*idx]; 2938f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 29398862d2efSBarry Smith } 29405850ef23SBarry Smith t[row] = sum1; 29418862d2efSBarry Smith x[row++] = sum1 * (*ibdiag++); 29428862d2efSBarry Smith break; 2943f0d39aaaSBarry Smith case 2: 2944f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 2945f0d39aaaSBarry Smith sum1 = b[row]; 2946f0d39aaaSBarry Smith sum2 = b[row + 1]; 2947f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 2948f0d39aaaSBarry Smith i1 = idx[0]; 2949f0d39aaaSBarry Smith i2 = idx[1]; 2950f0d39aaaSBarry Smith idx += 2; 2951f0d39aaaSBarry Smith tmp0 = x[i1]; 2952f0d39aaaSBarry Smith tmp1 = x[i2]; 29539371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 29549371c9d4SSatish Balay v1 += 2; 29559371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 29569371c9d4SSatish Balay v2 += 2; 2957f0d39aaaSBarry Smith } 2958f0d39aaaSBarry Smith 2959f0d39aaaSBarry Smith if (n == sz - 1) { 2960f0d39aaaSBarry Smith tmp0 = x[*idx]; 2961f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 2962f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 2963f0d39aaaSBarry Smith } 29645850ef23SBarry Smith t[row] = sum1; 29655850ef23SBarry Smith t[row + 1] = sum2; 2966f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2967f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2968f0d39aaaSBarry Smith ibdiag += 4; 2969f0d39aaaSBarry Smith break; 2970f0d39aaaSBarry Smith case 3: 2971f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 2972f0d39aaaSBarry Smith v3 = a->a + ii[row + 2]; 2973f0d39aaaSBarry Smith sum1 = b[row]; 2974f0d39aaaSBarry Smith sum2 = b[row + 1]; 2975f0d39aaaSBarry Smith sum3 = b[row + 2]; 2976f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 2977f0d39aaaSBarry Smith i1 = idx[0]; 2978f0d39aaaSBarry Smith i2 = idx[1]; 2979f0d39aaaSBarry Smith idx += 2; 2980f0d39aaaSBarry Smith tmp0 = x[i1]; 2981f0d39aaaSBarry Smith tmp1 = x[i2]; 29829371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 
29839371c9d4SSatish Balay v1 += 2; 29849371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 29859371c9d4SSatish Balay v2 += 2; 29869371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 29879371c9d4SSatish Balay v3 += 2; 2988f0d39aaaSBarry Smith } 2989f0d39aaaSBarry Smith 2990f0d39aaaSBarry Smith if (n == sz - 1) { 2991f0d39aaaSBarry Smith tmp0 = x[*idx]; 2992f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 2993f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 2994f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0; 2995f0d39aaaSBarry Smith } 29965850ef23SBarry Smith t[row] = sum1; 29975850ef23SBarry Smith t[row + 1] = sum2; 29985850ef23SBarry Smith t[row + 2] = sum3; 2999f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3000f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3001f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3002f0d39aaaSBarry Smith ibdiag += 9; 3003f0d39aaaSBarry Smith break; 3004f0d39aaaSBarry Smith case 4: 3005f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 3006f0d39aaaSBarry Smith v3 = a->a + ii[row + 2]; 3007f0d39aaaSBarry Smith v4 = a->a + ii[row + 3]; 3008f0d39aaaSBarry Smith sum1 = b[row]; 3009f0d39aaaSBarry Smith sum2 = b[row + 1]; 3010f0d39aaaSBarry Smith sum3 = b[row + 2]; 3011f0d39aaaSBarry Smith sum4 = b[row + 3]; 3012f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3013f0d39aaaSBarry Smith i1 = idx[0]; 3014f0d39aaaSBarry Smith i2 = idx[1]; 3015f0d39aaaSBarry Smith idx += 2; 3016f0d39aaaSBarry Smith tmp0 = x[i1]; 3017f0d39aaaSBarry Smith tmp1 = x[i2]; 30189371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 30199371c9d4SSatish Balay v1 += 2; 30209371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 30219371c9d4SSatish Balay v2 += 2; 30229371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 30239371c9d4SSatish Balay v3 += 2; 30249371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 30259371c9d4SSatish Balay v4 
+= 2; 3026f0d39aaaSBarry Smith } 3027f0d39aaaSBarry Smith 3028f0d39aaaSBarry Smith if (n == sz - 1) { 3029f0d39aaaSBarry Smith tmp0 = x[*idx]; 3030f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 3031f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 3032f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0; 3033f0d39aaaSBarry Smith sum4 -= v4[0] * tmp0; 3034f0d39aaaSBarry Smith } 30355850ef23SBarry Smith t[row] = sum1; 30365850ef23SBarry Smith t[row + 1] = sum2; 30375850ef23SBarry Smith t[row + 2] = sum3; 30385850ef23SBarry Smith t[row + 3] = sum4; 3039f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3040f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3041f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3042f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3043f0d39aaaSBarry Smith ibdiag += 16; 3044f0d39aaaSBarry Smith break; 3045f0d39aaaSBarry Smith case 5: 3046f0d39aaaSBarry Smith v2 = a->a + ii[row + 1]; 3047f0d39aaaSBarry Smith v3 = a->a + ii[row + 2]; 3048f0d39aaaSBarry Smith v4 = a->a + ii[row + 3]; 3049f0d39aaaSBarry Smith v5 = a->a + ii[row + 4]; 3050f0d39aaaSBarry Smith sum1 = b[row]; 3051f0d39aaaSBarry Smith sum2 = b[row + 1]; 3052f0d39aaaSBarry Smith sum3 = b[row + 2]; 3053f0d39aaaSBarry Smith sum4 = b[row + 3]; 3054f0d39aaaSBarry Smith sum5 = b[row + 4]; 3055f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3056f0d39aaaSBarry Smith i1 = idx[0]; 3057f0d39aaaSBarry Smith i2 = idx[1]; 3058f0d39aaaSBarry Smith idx += 2; 3059f0d39aaaSBarry Smith tmp0 = x[i1]; 3060f0d39aaaSBarry Smith tmp1 = x[i2]; 30619371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 30629371c9d4SSatish Balay v1 += 2; 30639371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 30649371c9d4SSatish Balay v2 += 2; 30659371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] 
* tmp1; 30669371c9d4SSatish Balay v3 += 2; 30679371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 30689371c9d4SSatish Balay v4 += 2; 30699371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 30709371c9d4SSatish Balay v5 += 2; 3071f0d39aaaSBarry Smith } 3072f0d39aaaSBarry Smith 3073f0d39aaaSBarry Smith if (n == sz - 1) { 3074f0d39aaaSBarry Smith tmp0 = x[*idx]; 3075f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0; 3076f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0; 3077f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0; 3078f0d39aaaSBarry Smith sum4 -= v4[0] * tmp0; 3079f0d39aaaSBarry Smith sum5 -= v5[0] * tmp0; 3080f0d39aaaSBarry Smith } 30815850ef23SBarry Smith t[row] = sum1; 30825850ef23SBarry Smith t[row + 1] = sum2; 30835850ef23SBarry Smith t[row + 2] = sum3; 30845850ef23SBarry Smith t[row + 3] = sum4; 30855850ef23SBarry Smith t[row + 4] = sum5; 3086f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3087f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3088f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3089f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3090f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3091f0d39aaaSBarry Smith ibdiag += 25; 3092f0d39aaaSBarry Smith break; 3093d71ae5a4SJacob Faibussowitsch default: 3094d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 30958862d2efSBarry Smith } 30962af78befSBarry Smith } 30972af78befSBarry Smith 30985850ef23SBarry Smith xb = t; 30999566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 31002af78befSBarry Smith } else xb = b; 
31012af78befSBarry Smith if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3102f0d39aaaSBarry Smith ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3103d0f46423SBarry Smith for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3104f0d39aaaSBarry Smith ibdiag -= sizes[i] * sizes[i]; 31058862d2efSBarry Smith sz = ii[row + 1] - diag[row] - 1; 31068862d2efSBarry Smith v1 = a->a + diag[row] + 1; 31078862d2efSBarry Smith idx = a->j + diag[row] + 1; 31082af78befSBarry Smith 31094108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 31108862d2efSBarry Smith switch (sizes[i]) { 31118862d2efSBarry Smith case 1: 31128862d2efSBarry Smith 31138862d2efSBarry Smith sum1 = xb[row]; 31148862d2efSBarry Smith for (n = 0; n < sz - 1; n += 2) { 31158862d2efSBarry Smith i1 = idx[0]; 31168862d2efSBarry Smith i2 = idx[1]; 31178862d2efSBarry Smith idx += 2; 31188862d2efSBarry Smith tmp0 = x[i1]; 31198862d2efSBarry Smith tmp1 = x[i2]; 31209371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 31219371c9d4SSatish Balay v1 += 2; 31228862d2efSBarry Smith } 31238862d2efSBarry Smith 31248862d2efSBarry Smith if (n == sz - 1) { 3125f0d39aaaSBarry Smith tmp0 = x[*idx]; 3126f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 31278862d2efSBarry Smith } 3128f0d39aaaSBarry Smith x[row--] = sum1 * (*ibdiag); 3129f0d39aaaSBarry Smith break; 3130f0d39aaaSBarry Smith 3131f0d39aaaSBarry Smith case 2: 3132f0d39aaaSBarry Smith 3133f0d39aaaSBarry Smith sum1 = xb[row]; 3134f0d39aaaSBarry Smith sum2 = xb[row - 1]; 3135f0d39aaaSBarry Smith /* note that sum1 is associated with the second of the two rows */ 3136f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 3137f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3138f0d39aaaSBarry Smith i1 = idx[0]; 3139f0d39aaaSBarry Smith i2 = idx[1]; 3140f0d39aaaSBarry Smith idx += 2; 3141f0d39aaaSBarry Smith tmp0 = x[i1]; 3142f0d39aaaSBarry Smith tmp1 = x[i2]; 31439371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 
31449371c9d4SSatish Balay v1 += 2; 31459371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 31469371c9d4SSatish Balay v2 += 2; 3147f0d39aaaSBarry Smith } 3148f0d39aaaSBarry Smith 3149f0d39aaaSBarry Smith if (n == sz - 1) { 3150f0d39aaaSBarry Smith tmp0 = x[*idx]; 3151f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 3152f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 3153f0d39aaaSBarry Smith } 3154f0d39aaaSBarry Smith x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3155f0d39aaaSBarry Smith x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3156f0d39aaaSBarry Smith break; 3157f0d39aaaSBarry Smith case 3: 3158f0d39aaaSBarry Smith 3159f0d39aaaSBarry Smith sum1 = xb[row]; 3160f0d39aaaSBarry Smith sum2 = xb[row - 1]; 3161f0d39aaaSBarry Smith sum3 = xb[row - 2]; 3162f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 3163f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3; 3164f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3165f0d39aaaSBarry Smith i1 = idx[0]; 3166f0d39aaaSBarry Smith i2 = idx[1]; 3167f0d39aaaSBarry Smith idx += 2; 3168f0d39aaaSBarry Smith tmp0 = x[i1]; 3169f0d39aaaSBarry Smith tmp1 = x[i2]; 31709371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 31719371c9d4SSatish Balay v1 += 2; 31729371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 31739371c9d4SSatish Balay v2 += 2; 31749371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 31759371c9d4SSatish Balay v3 += 2; 3176f0d39aaaSBarry Smith } 3177f0d39aaaSBarry Smith 3178f0d39aaaSBarry Smith if (n == sz - 1) { 3179f0d39aaaSBarry Smith tmp0 = x[*idx]; 3180f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 3181f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 3182f0d39aaaSBarry Smith sum3 -= *v3 * tmp0; 3183f0d39aaaSBarry Smith } 3184f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3185f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3186f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 
3187f0d39aaaSBarry Smith break; 3188f0d39aaaSBarry Smith case 4: 3189f0d39aaaSBarry Smith 3190f0d39aaaSBarry Smith sum1 = xb[row]; 3191f0d39aaaSBarry Smith sum2 = xb[row - 1]; 3192f0d39aaaSBarry Smith sum3 = xb[row - 2]; 3193f0d39aaaSBarry Smith sum4 = xb[row - 3]; 3194f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 3195f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3; 3196f0d39aaaSBarry Smith v4 = a->a + diag[row - 3] + 4; 3197f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3198f0d39aaaSBarry Smith i1 = idx[0]; 3199f0d39aaaSBarry Smith i2 = idx[1]; 3200f0d39aaaSBarry Smith idx += 2; 3201f0d39aaaSBarry Smith tmp0 = x[i1]; 3202f0d39aaaSBarry Smith tmp1 = x[i2]; 32039371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 32049371c9d4SSatish Balay v1 += 2; 32059371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 32069371c9d4SSatish Balay v2 += 2; 32079371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 32089371c9d4SSatish Balay v3 += 2; 32099371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 32109371c9d4SSatish Balay v4 += 2; 3211f0d39aaaSBarry Smith } 3212f0d39aaaSBarry Smith 3213f0d39aaaSBarry Smith if (n == sz - 1) { 3214f0d39aaaSBarry Smith tmp0 = x[*idx]; 3215f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 3216f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 3217f0d39aaaSBarry Smith sum3 -= *v3 * tmp0; 3218f0d39aaaSBarry Smith sum4 -= *v4 * tmp0; 3219f0d39aaaSBarry Smith } 3220f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3221f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3222f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3223f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3224f0d39aaaSBarry Smith break; 3225f0d39aaaSBarry Smith case 5: 3226f0d39aaaSBarry Smith 3227f0d39aaaSBarry Smith sum1 = xb[row]; 
3228f0d39aaaSBarry Smith sum2 = xb[row - 1]; 3229f0d39aaaSBarry Smith sum3 = xb[row - 2]; 3230f0d39aaaSBarry Smith sum4 = xb[row - 3]; 3231f0d39aaaSBarry Smith sum5 = xb[row - 4]; 3232f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2; 3233f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3; 3234f0d39aaaSBarry Smith v4 = a->a + diag[row - 3] + 4; 3235f0d39aaaSBarry Smith v5 = a->a + diag[row - 4] + 5; 3236f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) { 3237f0d39aaaSBarry Smith i1 = idx[0]; 3238f0d39aaaSBarry Smith i2 = idx[1]; 3239f0d39aaaSBarry Smith idx += 2; 3240f0d39aaaSBarry Smith tmp0 = x[i1]; 3241f0d39aaaSBarry Smith tmp1 = x[i2]; 32429371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 32439371c9d4SSatish Balay v1 += 2; 32449371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 32459371c9d4SSatish Balay v2 += 2; 32469371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 32479371c9d4SSatish Balay v3 += 2; 32489371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 32499371c9d4SSatish Balay v4 += 2; 32509371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 32519371c9d4SSatish Balay v5 += 2; 3252f0d39aaaSBarry Smith } 3253f0d39aaaSBarry Smith 3254f0d39aaaSBarry Smith if (n == sz - 1) { 3255f0d39aaaSBarry Smith tmp0 = x[*idx]; 3256f0d39aaaSBarry Smith sum1 -= *v1 * tmp0; 3257f0d39aaaSBarry Smith sum2 -= *v2 * tmp0; 3258f0d39aaaSBarry Smith sum3 -= *v3 * tmp0; 3259f0d39aaaSBarry Smith sum4 -= *v4 * tmp0; 3260f0d39aaaSBarry Smith sum5 -= *v5 * tmp0; 3261f0d39aaaSBarry Smith } 3262f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3263f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3264f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3265f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[1] + sum4 * 
ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3266f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 32678862d2efSBarry Smith break; 3268d71ae5a4SJacob Faibussowitsch default: 3269d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 32708862d2efSBarry Smith } 32712af78befSBarry Smith } 32722af78befSBarry Smith 32739566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 32742af78befSBarry Smith } 32752af78befSBarry Smith its--; 32765850ef23SBarry Smith } 32775850ef23SBarry Smith while (its--) { 32785850ef23SBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 32799371c9d4SSatish Balay for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) { 3280d876e2b0SMark Adams sz = diag[row] - ii[row]; 32815850ef23SBarry Smith v1 = a->a + ii[row]; 32825850ef23SBarry Smith idx = a->j + ii[row]; 32835850ef23SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 32845850ef23SBarry Smith switch (sizes[i]) { 32855850ef23SBarry Smith case 1: 32865850ef23SBarry Smith sum1 = b[row]; 32875850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 32885850ef23SBarry Smith i1 = idx[0]; 32895850ef23SBarry Smith i2 = idx[1]; 32905850ef23SBarry Smith idx += 2; 32915850ef23SBarry Smith tmp0 = x[i1]; 32925850ef23SBarry Smith tmp1 = x[i2]; 32939371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 32949371c9d4SSatish Balay v1 += 2; 32955850ef23SBarry Smith } 32965850ef23SBarry Smith if (n == sz - 1) { 3297d876e2b0SMark Adams tmp0 = x[*idx++]; 3298d876e2b0SMark Adams sum1 -= *v1 * tmp0; 3299d876e2b0SMark Adams v1++; 3300d876e2b0SMark Adams } 3301d876e2b0SMark Adams t[row] = sum1; 3302d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 1; 3303d876e2b0SMark Adams idx = a->j + diag[row] + 1; 3304d876e2b0SMark Adams v1 
+= 1; 3305d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3306d876e2b0SMark Adams i1 = idx[0]; 3307d876e2b0SMark Adams i2 = idx[1]; 3308d876e2b0SMark Adams idx += 2; 3309d876e2b0SMark Adams tmp0 = x[i1]; 3310d876e2b0SMark Adams tmp1 = x[i2]; 33119371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 33129371c9d4SSatish Balay v1 += 2; 3313d876e2b0SMark Adams } 3314d876e2b0SMark Adams if (n == sz - 1) { 3315d876e2b0SMark Adams tmp0 = x[*idx++]; 33165850ef23SBarry Smith sum1 -= *v1 * tmp0; 33175850ef23SBarry Smith } 33185850ef23SBarry Smith /* in MatSOR_SeqAIJ this line would be 33195850ef23SBarry Smith * 33205850ef23SBarry Smith * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 33215850ef23SBarry Smith * 33225850ef23SBarry Smith * but omega == 1, so this becomes 33235850ef23SBarry Smith * 3324d876e2b0SMark Adams * x[row] = sum1*(*ibdiag++); 33255850ef23SBarry Smith * 33265850ef23SBarry Smith */ 3327d876e2b0SMark Adams x[row] = sum1 * (*ibdiag); 33285850ef23SBarry Smith break; 33295850ef23SBarry Smith case 2: 33305850ef23SBarry Smith v2 = a->a + ii[row + 1]; 33315850ef23SBarry Smith sum1 = b[row]; 33325850ef23SBarry Smith sum2 = b[row + 1]; 33335850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 33345850ef23SBarry Smith i1 = idx[0]; 33355850ef23SBarry Smith i2 = idx[1]; 33365850ef23SBarry Smith idx += 2; 33375850ef23SBarry Smith tmp0 = x[i1]; 33385850ef23SBarry Smith tmp1 = x[i2]; 33399371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 33409371c9d4SSatish Balay v1 += 2; 33419371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 33429371c9d4SSatish Balay v2 += 2; 33435850ef23SBarry Smith } 3344d876e2b0SMark Adams if (n == sz - 1) { 3345d876e2b0SMark Adams tmp0 = x[*idx++]; 3346d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 3347d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 33489371c9d4SSatish Balay v1++; 33499371c9d4SSatish Balay v2++; 3350d876e2b0SMark Adams } 3351d876e2b0SMark Adams t[row] = sum1; 3352d876e2b0SMark Adams t[row + 1] = 
sum2; 3353d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 2; 3354d876e2b0SMark Adams idx = a->j + diag[row] + 2; 3355d876e2b0SMark Adams v1 += 2; 3356d876e2b0SMark Adams v2 += 2; 3357d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3358d876e2b0SMark Adams i1 = idx[0]; 3359d876e2b0SMark Adams i2 = idx[1]; 3360d876e2b0SMark Adams idx += 2; 3361d876e2b0SMark Adams tmp0 = x[i1]; 3362d876e2b0SMark Adams tmp1 = x[i2]; 33639371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 33649371c9d4SSatish Balay v1 += 2; 33659371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 33669371c9d4SSatish Balay v2 += 2; 3367d876e2b0SMark Adams } 33685850ef23SBarry Smith if (n == sz - 1) { 33695850ef23SBarry Smith tmp0 = x[*idx]; 33705850ef23SBarry Smith sum1 -= v1[0] * tmp0; 33715850ef23SBarry Smith sum2 -= v2[0] * tmp0; 33725850ef23SBarry Smith } 3373d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3374d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 33755850ef23SBarry Smith break; 33765850ef23SBarry Smith case 3: 33775850ef23SBarry Smith v2 = a->a + ii[row + 1]; 33785850ef23SBarry Smith v3 = a->a + ii[row + 2]; 33795850ef23SBarry Smith sum1 = b[row]; 33805850ef23SBarry Smith sum2 = b[row + 1]; 33815850ef23SBarry Smith sum3 = b[row + 2]; 33825850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 33835850ef23SBarry Smith i1 = idx[0]; 33845850ef23SBarry Smith i2 = idx[1]; 33855850ef23SBarry Smith idx += 2; 33865850ef23SBarry Smith tmp0 = x[i1]; 33875850ef23SBarry Smith tmp1 = x[i2]; 33889371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 33899371c9d4SSatish Balay v1 += 2; 33909371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 33919371c9d4SSatish Balay v2 += 2; 33929371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 33939371c9d4SSatish Balay v3 += 2; 33945850ef23SBarry Smith } 3395d876e2b0SMark Adams if (n == sz - 1) { 3396d876e2b0SMark Adams tmp0 = x[*idx++]; 3397d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 
3398d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 3399d876e2b0SMark Adams sum3 -= v3[0] * tmp0; 34009371c9d4SSatish Balay v1++; 34019371c9d4SSatish Balay v2++; 34029371c9d4SSatish Balay v3++; 3403d876e2b0SMark Adams } 3404d876e2b0SMark Adams t[row] = sum1; 3405d876e2b0SMark Adams t[row + 1] = sum2; 3406d876e2b0SMark Adams t[row + 2] = sum3; 3407d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 3; 3408d876e2b0SMark Adams idx = a->j + diag[row] + 3; 3409d876e2b0SMark Adams v1 += 3; 3410d876e2b0SMark Adams v2 += 3; 3411d876e2b0SMark Adams v3 += 3; 3412d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3413d876e2b0SMark Adams i1 = idx[0]; 3414d876e2b0SMark Adams i2 = idx[1]; 3415d876e2b0SMark Adams idx += 2; 3416d876e2b0SMark Adams tmp0 = x[i1]; 3417d876e2b0SMark Adams tmp1 = x[i2]; 34189371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 34199371c9d4SSatish Balay v1 += 2; 34209371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 34219371c9d4SSatish Balay v2 += 2; 34229371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 34239371c9d4SSatish Balay v3 += 2; 3424d876e2b0SMark Adams } 34255850ef23SBarry Smith if (n == sz - 1) { 34265850ef23SBarry Smith tmp0 = x[*idx]; 34275850ef23SBarry Smith sum1 -= v1[0] * tmp0; 34285850ef23SBarry Smith sum2 -= v2[0] * tmp0; 34295850ef23SBarry Smith sum3 -= v3[0] * tmp0; 34305850ef23SBarry Smith } 3431d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3432d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3433d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 34345850ef23SBarry Smith break; 34355850ef23SBarry Smith case 4: 34365850ef23SBarry Smith v2 = a->a + ii[row + 1]; 34375850ef23SBarry Smith v3 = a->a + ii[row + 2]; 34385850ef23SBarry Smith v4 = a->a + ii[row + 3]; 34395850ef23SBarry Smith sum1 = b[row]; 34405850ef23SBarry Smith sum2 = b[row + 1]; 34415850ef23SBarry Smith sum3 = b[row + 2]; 34425850ef23SBarry 
Smith sum4 = b[row + 3]; 34435850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 34445850ef23SBarry Smith i1 = idx[0]; 34455850ef23SBarry Smith i2 = idx[1]; 34465850ef23SBarry Smith idx += 2; 34475850ef23SBarry Smith tmp0 = x[i1]; 34485850ef23SBarry Smith tmp1 = x[i2]; 34499371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 34509371c9d4SSatish Balay v1 += 2; 34519371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 34529371c9d4SSatish Balay v2 += 2; 34539371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 34549371c9d4SSatish Balay v3 += 2; 34559371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 34569371c9d4SSatish Balay v4 += 2; 34575850ef23SBarry Smith } 3458d876e2b0SMark Adams if (n == sz - 1) { 3459d876e2b0SMark Adams tmp0 = x[*idx++]; 3460d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 3461d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 3462d876e2b0SMark Adams sum3 -= v3[0] * tmp0; 3463d876e2b0SMark Adams sum4 -= v4[0] * tmp0; 34649371c9d4SSatish Balay v1++; 34659371c9d4SSatish Balay v2++; 34669371c9d4SSatish Balay v3++; 34679371c9d4SSatish Balay v4++; 3468d876e2b0SMark Adams } 3469d876e2b0SMark Adams t[row] = sum1; 3470d876e2b0SMark Adams t[row + 1] = sum2; 3471d876e2b0SMark Adams t[row + 2] = sum3; 3472d876e2b0SMark Adams t[row + 3] = sum4; 3473d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 4; 3474d876e2b0SMark Adams idx = a->j + diag[row] + 4; 3475d876e2b0SMark Adams v1 += 4; 3476d876e2b0SMark Adams v2 += 4; 3477d876e2b0SMark Adams v3 += 4; 3478d876e2b0SMark Adams v4 += 4; 3479d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3480d876e2b0SMark Adams i1 = idx[0]; 3481d876e2b0SMark Adams i2 = idx[1]; 3482d876e2b0SMark Adams idx += 2; 3483d876e2b0SMark Adams tmp0 = x[i1]; 3484d876e2b0SMark Adams tmp1 = x[i2]; 34859371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 34869371c9d4SSatish Balay v1 += 2; 34879371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 34889371c9d4SSatish Balay v2 += 2; 34899371c9d4SSatish Balay sum3 -= 
v3[0] * tmp0 + v3[1] * tmp1; 34909371c9d4SSatish Balay v3 += 2; 34919371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 34929371c9d4SSatish Balay v4 += 2; 3493d876e2b0SMark Adams } 34945850ef23SBarry Smith if (n == sz - 1) { 34955850ef23SBarry Smith tmp0 = x[*idx]; 34965850ef23SBarry Smith sum1 -= v1[0] * tmp0; 34975850ef23SBarry Smith sum2 -= v2[0] * tmp0; 34985850ef23SBarry Smith sum3 -= v3[0] * tmp0; 34995850ef23SBarry Smith sum4 -= v4[0] * tmp0; 35005850ef23SBarry Smith } 3501d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3502d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3503d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3504d876e2b0SMark Adams x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 35055850ef23SBarry Smith break; 35065850ef23SBarry Smith case 5: 35075850ef23SBarry Smith v2 = a->a + ii[row + 1]; 35085850ef23SBarry Smith v3 = a->a + ii[row + 2]; 35095850ef23SBarry Smith v4 = a->a + ii[row + 3]; 35105850ef23SBarry Smith v5 = a->a + ii[row + 4]; 35115850ef23SBarry Smith sum1 = b[row]; 35125850ef23SBarry Smith sum2 = b[row + 1]; 35135850ef23SBarry Smith sum3 = b[row + 2]; 35145850ef23SBarry Smith sum4 = b[row + 3]; 35155850ef23SBarry Smith sum5 = b[row + 4]; 35165850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 35175850ef23SBarry Smith i1 = idx[0]; 35185850ef23SBarry Smith i2 = idx[1]; 35195850ef23SBarry Smith idx += 2; 35205850ef23SBarry Smith tmp0 = x[i1]; 35215850ef23SBarry Smith tmp1 = x[i2]; 35229371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 35239371c9d4SSatish Balay v1 += 2; 35249371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 35259371c9d4SSatish Balay v2 += 2; 35269371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 35279371c9d4SSatish Balay v3 += 2; 35289371c9d4SSatish Balay sum4 -= 
v4[0] * tmp0 + v4[1] * tmp1; 35299371c9d4SSatish Balay v4 += 2; 35309371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 35319371c9d4SSatish Balay v5 += 2; 35325850ef23SBarry Smith } 35335850ef23SBarry Smith if (n == sz - 1) { 3534d876e2b0SMark Adams tmp0 = x[*idx++]; 35355850ef23SBarry Smith sum1 -= v1[0] * tmp0; 35365850ef23SBarry Smith sum2 -= v2[0] * tmp0; 35375850ef23SBarry Smith sum3 -= v3[0] * tmp0; 35385850ef23SBarry Smith sum4 -= v4[0] * tmp0; 35395850ef23SBarry Smith sum5 -= v5[0] * tmp0; 35409371c9d4SSatish Balay v1++; 35419371c9d4SSatish Balay v2++; 35429371c9d4SSatish Balay v3++; 35439371c9d4SSatish Balay v4++; 35449371c9d4SSatish Balay v5++; 35455850ef23SBarry Smith } 3546d876e2b0SMark Adams t[row] = sum1; 3547d876e2b0SMark Adams t[row + 1] = sum2; 3548d876e2b0SMark Adams t[row + 2] = sum3; 3549d876e2b0SMark Adams t[row + 3] = sum4; 3550d876e2b0SMark Adams t[row + 4] = sum5; 3551d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 5; 3552d876e2b0SMark Adams idx = a->j + diag[row] + 5; 3553d876e2b0SMark Adams v1 += 5; 3554d876e2b0SMark Adams v2 += 5; 3555d876e2b0SMark Adams v3 += 5; 3556d876e2b0SMark Adams v4 += 5; 3557d876e2b0SMark Adams v5 += 5; 35585850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) { 35595850ef23SBarry Smith i1 = idx[0]; 35605850ef23SBarry Smith i2 = idx[1]; 35615850ef23SBarry Smith idx += 2; 35625850ef23SBarry Smith tmp0 = x[i1]; 35635850ef23SBarry Smith tmp1 = x[i2]; 35649371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 35659371c9d4SSatish Balay v1 += 2; 35669371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 35679371c9d4SSatish Balay v2 += 2; 35689371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 35699371c9d4SSatish Balay v3 += 2; 35709371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 35719371c9d4SSatish Balay v4 += 2; 35729371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 35739371c9d4SSatish Balay v5 += 2; 35745850ef23SBarry Smith } 35755850ef23SBarry Smith if (n == sz - 1) { 
35765850ef23SBarry Smith tmp0 = x[*idx]; 3577d876e2b0SMark Adams sum1 -= v1[0] * tmp0; 3578d876e2b0SMark Adams sum2 -= v2[0] * tmp0; 3579d876e2b0SMark Adams sum3 -= v3[0] * tmp0; 3580d876e2b0SMark Adams sum4 -= v4[0] * tmp0; 3581d876e2b0SMark Adams sum5 -= v5[0] * tmp0; 35825850ef23SBarry Smith } 3583d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3584d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3585d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3586d876e2b0SMark Adams x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3587d876e2b0SMark Adams x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3588d876e2b0SMark Adams break; 3589d71ae5a4SJacob Faibussowitsch default: 3590d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3591d876e2b0SMark Adams } 3592d876e2b0SMark Adams } 3593d876e2b0SMark Adams xb = t; 35949566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3595d876e2b0SMark Adams } else xb = b; 3596d876e2b0SMark Adams 3597d876e2b0SMark Adams if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3598d876e2b0SMark Adams ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3599d876e2b0SMark Adams for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3600d876e2b0SMark Adams ibdiag -= sizes[i] * sizes[i]; 3601d876e2b0SMark Adams 3602d876e2b0SMark Adams /* set RHS */ 3603d876e2b0SMark Adams if (xb == b) { 3604d876e2b0SMark Adams /* whole (old way) */ 3605d876e2b0SMark Adams sz = ii[row + 1] - ii[row]; 3606d876e2b0SMark Adams idx = a->j + ii[row]; 3607d876e2b0SMark Adams switch 
(sizes[i]) { 3608d71ae5a4SJacob Faibussowitsch case 5: 3609d71ae5a4SJacob Faibussowitsch v5 = a->a + ii[row - 4]; /* fall through */ 3610d71ae5a4SJacob Faibussowitsch case 4: 3611d71ae5a4SJacob Faibussowitsch v4 = a->a + ii[row - 3]; /* fall through */ 3612d71ae5a4SJacob Faibussowitsch case 3: 3613d71ae5a4SJacob Faibussowitsch v3 = a->a + ii[row - 2]; /* fall through */ 3614d71ae5a4SJacob Faibussowitsch case 2: 3615d71ae5a4SJacob Faibussowitsch v2 = a->a + ii[row - 1]; /* fall through */ 3616d71ae5a4SJacob Faibussowitsch case 1: 3617d71ae5a4SJacob Faibussowitsch v1 = a->a + ii[row]; 3618d71ae5a4SJacob Faibussowitsch break; 3619d71ae5a4SJacob Faibussowitsch default: 3620d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3621d876e2b0SMark Adams } 3622d876e2b0SMark Adams } else { 3623d876e2b0SMark Adams /* upper, no diag */ 3624d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 1; 3625d876e2b0SMark Adams idx = a->j + diag[row] + 1; 3626d876e2b0SMark Adams switch (sizes[i]) { 3627d71ae5a4SJacob Faibussowitsch case 5: 3628d71ae5a4SJacob Faibussowitsch v5 = a->a + diag[row - 4] + 5; /* fall through */ 3629d71ae5a4SJacob Faibussowitsch case 4: 3630d71ae5a4SJacob Faibussowitsch v4 = a->a + diag[row - 3] + 4; /* fall through */ 3631d71ae5a4SJacob Faibussowitsch case 3: 3632d71ae5a4SJacob Faibussowitsch v3 = a->a + diag[row - 2] + 3; /* fall through */ 3633d71ae5a4SJacob Faibussowitsch case 2: 3634d71ae5a4SJacob Faibussowitsch v2 = a->a + diag[row - 1] + 2; /* fall through */ 3635d71ae5a4SJacob Faibussowitsch case 1: 3636d71ae5a4SJacob Faibussowitsch v1 = a->a + diag[row] + 1; 3637d876e2b0SMark Adams } 3638d876e2b0SMark Adams } 3639d876e2b0SMark Adams /* set sum */ 3640d876e2b0SMark Adams switch (sizes[i]) { 3641d71ae5a4SJacob Faibussowitsch case 5: 3642d71ae5a4SJacob Faibussowitsch sum5 = xb[row - 4]; /* fall through */ 3643d71ae5a4SJacob Faibussowitsch case 4: 3644d71ae5a4SJacob Faibussowitsch sum4 
= xb[row - 3]; /* fall through */ 3645d71ae5a4SJacob Faibussowitsch case 3: 3646d71ae5a4SJacob Faibussowitsch sum3 = xb[row - 2]; /* fall through */ 3647d71ae5a4SJacob Faibussowitsch case 2: 3648d71ae5a4SJacob Faibussowitsch sum2 = xb[row - 1]; /* fall through */ 3649d876e2b0SMark Adams case 1: 3650d876e2b0SMark Adams /* note that sum1 is associated with the last row */ 3651d876e2b0SMark Adams sum1 = xb[row]; 3652d876e2b0SMark Adams } 3653d876e2b0SMark Adams /* do sums */ 3654d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) { 3655d876e2b0SMark Adams i1 = idx[0]; 3656d876e2b0SMark Adams i2 = idx[1]; 3657d876e2b0SMark Adams idx += 2; 3658d876e2b0SMark Adams tmp0 = x[i1]; 3659d876e2b0SMark Adams tmp1 = x[i2]; 3660d876e2b0SMark Adams switch (sizes[i]) { 3661d71ae5a4SJacob Faibussowitsch case 5: 3662d71ae5a4SJacob Faibussowitsch sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3663d71ae5a4SJacob Faibussowitsch v5 += 2; /* fall through */ 3664d71ae5a4SJacob Faibussowitsch case 4: 3665d71ae5a4SJacob Faibussowitsch sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3666d71ae5a4SJacob Faibussowitsch v4 += 2; /* fall through */ 3667d71ae5a4SJacob Faibussowitsch case 3: 3668d71ae5a4SJacob Faibussowitsch sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3669d71ae5a4SJacob Faibussowitsch v3 += 2; /* fall through */ 3670d71ae5a4SJacob Faibussowitsch case 2: 3671d71ae5a4SJacob Faibussowitsch sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3672d71ae5a4SJacob Faibussowitsch v2 += 2; /* fall through */ 3673d71ae5a4SJacob Faibussowitsch case 1: 3674d71ae5a4SJacob Faibussowitsch sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3675d71ae5a4SJacob Faibussowitsch v1 += 2; 3676d876e2b0SMark Adams } 3677d876e2b0SMark Adams } 3678d876e2b0SMark Adams /* ragged edge */ 3679d876e2b0SMark Adams if (n == sz - 1) { 3680d876e2b0SMark Adams tmp0 = x[*idx]; 3681d876e2b0SMark Adams switch (sizes[i]) { 3682d71ae5a4SJacob Faibussowitsch case 5: 3683d71ae5a4SJacob Faibussowitsch sum5 -= *v5 * tmp0; /* fall through */ 3684d71ae5a4SJacob Faibussowitsch case 4: 
3685d71ae5a4SJacob Faibussowitsch sum4 -= *v4 * tmp0; /* fall through */ 3686d71ae5a4SJacob Faibussowitsch case 3: 3687d71ae5a4SJacob Faibussowitsch sum3 -= *v3 * tmp0; /* fall through */ 3688d71ae5a4SJacob Faibussowitsch case 2: 3689d71ae5a4SJacob Faibussowitsch sum2 -= *v2 * tmp0; /* fall through */ 3690d71ae5a4SJacob Faibussowitsch case 1: 3691d71ae5a4SJacob Faibussowitsch sum1 -= *v1 * tmp0; 3692d876e2b0SMark Adams } 3693d876e2b0SMark Adams } 3694d876e2b0SMark Adams /* update */ 3695d876e2b0SMark Adams if (xb == b) { 3696d876e2b0SMark Adams /* whole (old way) w/ diag */ 3697d876e2b0SMark Adams switch (sizes[i]) { 3698d876e2b0SMark Adams case 5: 36995850ef23SBarry Smith x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 37005850ef23SBarry Smith x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 37015850ef23SBarry Smith x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 37025850ef23SBarry Smith x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 37035850ef23SBarry Smith x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 37045850ef23SBarry Smith break; 3705d876e2b0SMark Adams case 4: 3706d876e2b0SMark Adams x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3707d876e2b0SMark Adams x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3708d876e2b0SMark Adams x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3709d876e2b0SMark Adams x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3710d876e2b0SMark Adams break; 3711d876e2b0SMark Adams case 3: 3712d876e2b0SMark Adams x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 
3713d876e2b0SMark Adams x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3714d876e2b0SMark Adams x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3715d876e2b0SMark Adams break; 3716d876e2b0SMark Adams case 2: 3717d876e2b0SMark Adams x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3718d876e2b0SMark Adams x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3719d876e2b0SMark Adams break; 3720d71ae5a4SJacob Faibussowitsch case 1: 3721d71ae5a4SJacob Faibussowitsch x[row--] += sum1 * (*ibdiag); 3722d71ae5a4SJacob Faibussowitsch break; 3723d876e2b0SMark Adams } 3724d876e2b0SMark Adams } else { 3725d876e2b0SMark Adams /* no diag so set = */ 3726d876e2b0SMark Adams switch (sizes[i]) { 3727d876e2b0SMark Adams case 5: 3728d876e2b0SMark Adams x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3729d876e2b0SMark Adams x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3730d876e2b0SMark Adams x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3731d876e2b0SMark Adams x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3732d876e2b0SMark Adams x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3733d876e2b0SMark Adams break; 3734d876e2b0SMark Adams case 4: 3735d876e2b0SMark Adams x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3736d876e2b0SMark Adams x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3737d876e2b0SMark Adams x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3738d876e2b0SMark Adams x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3739d876e2b0SMark Adams break; 3740d876e2b0SMark Adams case 3: 
3741d876e2b0SMark Adams x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3742d876e2b0SMark Adams x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3743d876e2b0SMark Adams x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3744d876e2b0SMark Adams break; 3745d876e2b0SMark Adams case 2: 3746d876e2b0SMark Adams x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3747d876e2b0SMark Adams x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3748d876e2b0SMark Adams break; 3749d71ae5a4SJacob Faibussowitsch case 1: 3750d71ae5a4SJacob Faibussowitsch x[row--] = sum1 * (*ibdiag); 3751d71ae5a4SJacob Faibussowitsch break; 37525850ef23SBarry Smith } 37535850ef23SBarry Smith } 3754d876e2b0SMark Adams } 3755d876e2b0SMark Adams if (xb == b) { 37569566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); 3757d876e2b0SMark Adams } else { 37589566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3759d876e2b0SMark Adams } 37605850ef23SBarry Smith } 37612af78befSBarry Smith } 376289c6957cSBarry Smith if (flag & SOR_EISENSTAT) { 376389c6957cSBarry Smith /* 376489c6957cSBarry Smith Apply (U + D)^-1 where D is now the block diagonal 376589c6957cSBarry Smith */ 376689c6957cSBarry Smith ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 376789c6957cSBarry Smith for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 376889c6957cSBarry Smith ibdiag -= sizes[i] * sizes[i]; 376989c6957cSBarry Smith sz = ii[row + 1] - diag[row] - 1; 377089c6957cSBarry Smith v1 = a->a + diag[row] + 1; 377189c6957cSBarry Smith idx = a->j + diag[row] + 1; 37724108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 377389c6957cSBarry Smith switch (sizes[i]) { 377489c6957cSBarry Smith case 1: 377589c6957cSBarry Smith 377689c6957cSBarry Smith sum1 = b[row]; 377789c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 377889c6957cSBarry Smith i1 = idx[0]; 377989c6957cSBarry Smith i2 = 
idx[1]; 378089c6957cSBarry Smith idx += 2; 378189c6957cSBarry Smith tmp0 = x[i1]; 378289c6957cSBarry Smith tmp1 = x[i2]; 37839371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 37849371c9d4SSatish Balay v1 += 2; 378589c6957cSBarry Smith } 378689c6957cSBarry Smith 378789c6957cSBarry Smith if (n == sz - 1) { 378889c6957cSBarry Smith tmp0 = x[*idx]; 378989c6957cSBarry Smith sum1 -= *v1 * tmp0; 379089c6957cSBarry Smith } 37919371c9d4SSatish Balay x[row] = sum1 * (*ibdiag); 37929371c9d4SSatish Balay row--; 379389c6957cSBarry Smith break; 379489c6957cSBarry Smith 379589c6957cSBarry Smith case 2: 379689c6957cSBarry Smith 379789c6957cSBarry Smith sum1 = b[row]; 379889c6957cSBarry Smith sum2 = b[row - 1]; 379989c6957cSBarry Smith /* note that sum1 is associated with the second of the two rows */ 380089c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 380189c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 380289c6957cSBarry Smith i1 = idx[0]; 380389c6957cSBarry Smith i2 = idx[1]; 380489c6957cSBarry Smith idx += 2; 380589c6957cSBarry Smith tmp0 = x[i1]; 380689c6957cSBarry Smith tmp1 = x[i2]; 38079371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 38089371c9d4SSatish Balay v1 += 2; 38099371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 38109371c9d4SSatish Balay v2 += 2; 381189c6957cSBarry Smith } 381289c6957cSBarry Smith 381389c6957cSBarry Smith if (n == sz - 1) { 381489c6957cSBarry Smith tmp0 = x[*idx]; 381589c6957cSBarry Smith sum1 -= *v1 * tmp0; 381689c6957cSBarry Smith sum2 -= *v2 * tmp0; 381789c6957cSBarry Smith } 3818938d4eb3SBarry Smith x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3819938d4eb3SBarry Smith x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3820938d4eb3SBarry Smith row -= 2; 382189c6957cSBarry Smith break; 382289c6957cSBarry Smith case 3: 382389c6957cSBarry Smith 382489c6957cSBarry Smith sum1 = b[row]; 382589c6957cSBarry Smith sum2 = b[row - 1]; 382689c6957cSBarry Smith sum3 = b[row - 2]; 382789c6957cSBarry Smith v2 = a->a + 
diag[row - 1] + 2; 382889c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3; 382989c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 383089c6957cSBarry Smith i1 = idx[0]; 383189c6957cSBarry Smith i2 = idx[1]; 383289c6957cSBarry Smith idx += 2; 383389c6957cSBarry Smith tmp0 = x[i1]; 383489c6957cSBarry Smith tmp1 = x[i2]; 38359371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 38369371c9d4SSatish Balay v1 += 2; 38379371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 38389371c9d4SSatish Balay v2 += 2; 38399371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 38409371c9d4SSatish Balay v3 += 2; 384189c6957cSBarry Smith } 384289c6957cSBarry Smith 384389c6957cSBarry Smith if (n == sz - 1) { 384489c6957cSBarry Smith tmp0 = x[*idx]; 384589c6957cSBarry Smith sum1 -= *v1 * tmp0; 384689c6957cSBarry Smith sum2 -= *v2 * tmp0; 384789c6957cSBarry Smith sum3 -= *v3 * tmp0; 384889c6957cSBarry Smith } 3849938d4eb3SBarry Smith x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3850938d4eb3SBarry Smith x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3851938d4eb3SBarry Smith x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3852938d4eb3SBarry Smith row -= 3; 385389c6957cSBarry Smith break; 385489c6957cSBarry Smith case 4: 385589c6957cSBarry Smith 385689c6957cSBarry Smith sum1 = b[row]; 385789c6957cSBarry Smith sum2 = b[row - 1]; 385889c6957cSBarry Smith sum3 = b[row - 2]; 385989c6957cSBarry Smith sum4 = b[row - 3]; 386089c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 386189c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3; 386289c6957cSBarry Smith v4 = a->a + diag[row - 3] + 4; 386389c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 386489c6957cSBarry Smith i1 = idx[0]; 386589c6957cSBarry Smith i2 = idx[1]; 386689c6957cSBarry Smith idx += 2; 386789c6957cSBarry Smith tmp0 = x[i1]; 386889c6957cSBarry Smith tmp1 = x[i2]; 38699371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 38709371c9d4SSatish Balay v1 
+= 2; 38719371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 38729371c9d4SSatish Balay v2 += 2; 38739371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 38749371c9d4SSatish Balay v3 += 2; 38759371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 38769371c9d4SSatish Balay v4 += 2; 387789c6957cSBarry Smith } 387889c6957cSBarry Smith 387989c6957cSBarry Smith if (n == sz - 1) { 388089c6957cSBarry Smith tmp0 = x[*idx]; 388189c6957cSBarry Smith sum1 -= *v1 * tmp0; 388289c6957cSBarry Smith sum2 -= *v2 * tmp0; 388389c6957cSBarry Smith sum3 -= *v3 * tmp0; 388489c6957cSBarry Smith sum4 -= *v4 * tmp0; 388589c6957cSBarry Smith } 3886938d4eb3SBarry Smith x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3887938d4eb3SBarry Smith x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3888938d4eb3SBarry Smith x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3889938d4eb3SBarry Smith x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3890938d4eb3SBarry Smith row -= 4; 389189c6957cSBarry Smith break; 389289c6957cSBarry Smith case 5: 389389c6957cSBarry Smith 389489c6957cSBarry Smith sum1 = b[row]; 389589c6957cSBarry Smith sum2 = b[row - 1]; 389689c6957cSBarry Smith sum3 = b[row - 2]; 389789c6957cSBarry Smith sum4 = b[row - 3]; 389889c6957cSBarry Smith sum5 = b[row - 4]; 389989c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2; 390089c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3; 390189c6957cSBarry Smith v4 = a->a + diag[row - 3] + 4; 390289c6957cSBarry Smith v5 = a->a + diag[row - 4] + 5; 390389c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 390489c6957cSBarry Smith i1 = idx[0]; 390589c6957cSBarry Smith i2 = idx[1]; 390689c6957cSBarry Smith idx += 2; 390789c6957cSBarry Smith tmp0 = x[i1]; 390889c6957cSBarry Smith tmp1 = x[i2]; 39099371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 39109371c9d4SSatish Balay 
v1 += 2; 39119371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 39129371c9d4SSatish Balay v2 += 2; 39139371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 39149371c9d4SSatish Balay v3 += 2; 39159371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 39169371c9d4SSatish Balay v4 += 2; 39179371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 39189371c9d4SSatish Balay v5 += 2; 391989c6957cSBarry Smith } 392089c6957cSBarry Smith 392189c6957cSBarry Smith if (n == sz - 1) { 392289c6957cSBarry Smith tmp0 = x[*idx]; 392389c6957cSBarry Smith sum1 -= *v1 * tmp0; 392489c6957cSBarry Smith sum2 -= *v2 * tmp0; 392589c6957cSBarry Smith sum3 -= *v3 * tmp0; 392689c6957cSBarry Smith sum4 -= *v4 * tmp0; 392789c6957cSBarry Smith sum5 -= *v5 * tmp0; 392889c6957cSBarry Smith } 3929938d4eb3SBarry Smith x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3930938d4eb3SBarry Smith x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3931938d4eb3SBarry Smith x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3932938d4eb3SBarry Smith x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3933938d4eb3SBarry Smith x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3934938d4eb3SBarry Smith row -= 5; 393589c6957cSBarry Smith break; 3936d71ae5a4SJacob Faibussowitsch default: 3937d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 393889c6957cSBarry Smith } 393989c6957cSBarry Smith } 39409566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 394189c6957cSBarry Smith 394289c6957cSBarry Smith /* 394389c6957cSBarry Smith t = b - D x where D is the block diagonal 394489c6957cSBarry Smith */ 394589c6957cSBarry Smith cnt 
= 0; 394689c6957cSBarry Smith for (i = 0, row = 0; i < m; i++) { 394789c6957cSBarry Smith switch (sizes[i]) { 394889c6957cSBarry Smith case 1: 39499371c9d4SSatish Balay t[row] = b[row] - bdiag[cnt++] * x[row]; 39509371c9d4SSatish Balay row++; 395189c6957cSBarry Smith break; 395289c6957cSBarry Smith case 2: 39539371c9d4SSatish Balay x1 = x[row]; 39549371c9d4SSatish Balay x2 = x[row + 1]; 395589c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 395689c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 395789c6957cSBarry Smith t[row] = b[row] - tmp1; 39589371c9d4SSatish Balay t[row + 1] = b[row + 1] - tmp2; 39599371c9d4SSatish Balay row += 2; 396089c6957cSBarry Smith cnt += 4; 396189c6957cSBarry Smith break; 396289c6957cSBarry Smith case 3: 39639371c9d4SSatish Balay x1 = x[row]; 39649371c9d4SSatish Balay x2 = x[row + 1]; 39659371c9d4SSatish Balay x3 = x[row + 2]; 396689c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 396789c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 396889c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 396989c6957cSBarry Smith t[row] = b[row] - tmp1; 397089c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2; 39719371c9d4SSatish Balay t[row + 2] = b[row + 2] - tmp3; 39729371c9d4SSatish Balay row += 3; 397389c6957cSBarry Smith cnt += 9; 397489c6957cSBarry Smith break; 397589c6957cSBarry Smith case 4: 39769371c9d4SSatish Balay x1 = x[row]; 39779371c9d4SSatish Balay x2 = x[row + 1]; 39789371c9d4SSatish Balay x3 = x[row + 2]; 39799371c9d4SSatish Balay x4 = x[row + 3]; 398089c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 398189c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 398289c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * 
bdiag[cnt + 14]; 398389c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 398489c6957cSBarry Smith t[row] = b[row] - tmp1; 398589c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2; 398689c6957cSBarry Smith t[row + 2] = b[row + 2] - tmp3; 39879371c9d4SSatish Balay t[row + 3] = b[row + 3] - tmp4; 39889371c9d4SSatish Balay row += 4; 398989c6957cSBarry Smith cnt += 16; 399089c6957cSBarry Smith break; 399189c6957cSBarry Smith case 5: 39929371c9d4SSatish Balay x1 = x[row]; 39939371c9d4SSatish Balay x2 = x[row + 1]; 39949371c9d4SSatish Balay x3 = x[row + 2]; 39959371c9d4SSatish Balay x4 = x[row + 3]; 39969371c9d4SSatish Balay x5 = x[row + 4]; 399789c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 399889c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 399989c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 400089c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 400189c6957cSBarry Smith tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 400289c6957cSBarry Smith t[row] = b[row] - tmp1; 400389c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2; 400489c6957cSBarry Smith t[row + 2] = b[row + 2] - tmp3; 400589c6957cSBarry Smith t[row + 3] = b[row + 3] - tmp4; 40069371c9d4SSatish Balay t[row + 4] = b[row + 4] - tmp5; 40079371c9d4SSatish Balay row += 5; 400889c6957cSBarry Smith cnt += 25; 400989c6957cSBarry Smith break; 4010d71ae5a4SJacob Faibussowitsch default: 4011d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 401289c6957cSBarry 
Smith } 401389c6957cSBarry Smith } 40149566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(m)); 401589c6957cSBarry Smith 401689c6957cSBarry Smith /* 401789c6957cSBarry Smith Apply (L + D)^-1 where D is the block diagonal 401889c6957cSBarry Smith */ 401989c6957cSBarry Smith for (i = 0, row = 0; i < m; i++) { 402089c6957cSBarry Smith sz = diag[row] - ii[row]; 402189c6957cSBarry Smith v1 = a->a + ii[row]; 402289c6957cSBarry Smith idx = a->j + ii[row]; 40234108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 402489c6957cSBarry Smith switch (sizes[i]) { 402589c6957cSBarry Smith case 1: 402689c6957cSBarry Smith 402789c6957cSBarry Smith sum1 = t[row]; 402889c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 402989c6957cSBarry Smith i1 = idx[0]; 403089c6957cSBarry Smith i2 = idx[1]; 403189c6957cSBarry Smith idx += 2; 403289c6957cSBarry Smith tmp0 = t[i1]; 403389c6957cSBarry Smith tmp1 = t[i2]; 40349371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 40359371c9d4SSatish Balay v1 += 2; 403689c6957cSBarry Smith } 403789c6957cSBarry Smith 403889c6957cSBarry Smith if (n == sz - 1) { 403989c6957cSBarry Smith tmp0 = t[*idx]; 404089c6957cSBarry Smith sum1 -= *v1 * tmp0; 404189c6957cSBarry Smith } 40429371c9d4SSatish Balay x[row] += t[row] = sum1 * (*ibdiag++); 40439371c9d4SSatish Balay row++; 404489c6957cSBarry Smith break; 404589c6957cSBarry Smith case 2: 404689c6957cSBarry Smith v2 = a->a + ii[row + 1]; 404789c6957cSBarry Smith sum1 = t[row]; 404889c6957cSBarry Smith sum2 = t[row + 1]; 404989c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 405089c6957cSBarry Smith i1 = idx[0]; 405189c6957cSBarry Smith i2 = idx[1]; 405289c6957cSBarry Smith idx += 2; 405389c6957cSBarry Smith tmp0 = t[i1]; 405489c6957cSBarry Smith tmp1 = t[i2]; 40559371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 40569371c9d4SSatish Balay v1 += 2; 40579371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 40589371c9d4SSatish Balay v2 += 2; 
405989c6957cSBarry Smith } 406089c6957cSBarry Smith 406189c6957cSBarry Smith if (n == sz - 1) { 406289c6957cSBarry Smith tmp0 = t[*idx]; 406389c6957cSBarry Smith sum1 -= v1[0] * tmp0; 406489c6957cSBarry Smith sum2 -= v2[0] * tmp0; 406589c6957cSBarry Smith } 406689c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 406789c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 40689371c9d4SSatish Balay ibdiag += 4; 40699371c9d4SSatish Balay row += 2; 407089c6957cSBarry Smith break; 407189c6957cSBarry Smith case 3: 407289c6957cSBarry Smith v2 = a->a + ii[row + 1]; 407389c6957cSBarry Smith v3 = a->a + ii[row + 2]; 407489c6957cSBarry Smith sum1 = t[row]; 407589c6957cSBarry Smith sum2 = t[row + 1]; 407689c6957cSBarry Smith sum3 = t[row + 2]; 407789c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 407889c6957cSBarry Smith i1 = idx[0]; 407989c6957cSBarry Smith i2 = idx[1]; 408089c6957cSBarry Smith idx += 2; 408189c6957cSBarry Smith tmp0 = t[i1]; 408289c6957cSBarry Smith tmp1 = t[i2]; 40839371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 40849371c9d4SSatish Balay v1 += 2; 40859371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 40869371c9d4SSatish Balay v2 += 2; 40879371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 40889371c9d4SSatish Balay v3 += 2; 408989c6957cSBarry Smith } 409089c6957cSBarry Smith 409189c6957cSBarry Smith if (n == sz - 1) { 409289c6957cSBarry Smith tmp0 = t[*idx]; 409389c6957cSBarry Smith sum1 -= v1[0] * tmp0; 409489c6957cSBarry Smith sum2 -= v2[0] * tmp0; 409589c6957cSBarry Smith sum3 -= v3[0] * tmp0; 409689c6957cSBarry Smith } 409789c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 409889c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 409989c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 41009371c9d4SSatish Balay ibdiag += 9; 
41019371c9d4SSatish Balay row += 3; 410289c6957cSBarry Smith break; 410389c6957cSBarry Smith case 4: 410489c6957cSBarry Smith v2 = a->a + ii[row + 1]; 410589c6957cSBarry Smith v3 = a->a + ii[row + 2]; 410689c6957cSBarry Smith v4 = a->a + ii[row + 3]; 410789c6957cSBarry Smith sum1 = t[row]; 410889c6957cSBarry Smith sum2 = t[row + 1]; 410989c6957cSBarry Smith sum3 = t[row + 2]; 411089c6957cSBarry Smith sum4 = t[row + 3]; 411189c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 411289c6957cSBarry Smith i1 = idx[0]; 411389c6957cSBarry Smith i2 = idx[1]; 411489c6957cSBarry Smith idx += 2; 411589c6957cSBarry Smith tmp0 = t[i1]; 411689c6957cSBarry Smith tmp1 = t[i2]; 41179371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 41189371c9d4SSatish Balay v1 += 2; 41199371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 41209371c9d4SSatish Balay v2 += 2; 41219371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 41229371c9d4SSatish Balay v3 += 2; 41239371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 41249371c9d4SSatish Balay v4 += 2; 412589c6957cSBarry Smith } 412689c6957cSBarry Smith 412789c6957cSBarry Smith if (n == sz - 1) { 412889c6957cSBarry Smith tmp0 = t[*idx]; 412989c6957cSBarry Smith sum1 -= v1[0] * tmp0; 413089c6957cSBarry Smith sum2 -= v2[0] * tmp0; 413189c6957cSBarry Smith sum3 -= v3[0] * tmp0; 413289c6957cSBarry Smith sum4 -= v4[0] * tmp0; 413389c6957cSBarry Smith } 413489c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 413589c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 413689c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 413789c6957cSBarry Smith x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 41389371c9d4SSatish Balay ibdiag += 16; 41399371c9d4SSatish Balay row += 4; 
414089c6957cSBarry Smith break; 414189c6957cSBarry Smith case 5: 414289c6957cSBarry Smith v2 = a->a + ii[row + 1]; 414389c6957cSBarry Smith v3 = a->a + ii[row + 2]; 414489c6957cSBarry Smith v4 = a->a + ii[row + 3]; 414589c6957cSBarry Smith v5 = a->a + ii[row + 4]; 414689c6957cSBarry Smith sum1 = t[row]; 414789c6957cSBarry Smith sum2 = t[row + 1]; 414889c6957cSBarry Smith sum3 = t[row + 2]; 414989c6957cSBarry Smith sum4 = t[row + 3]; 415089c6957cSBarry Smith sum5 = t[row + 4]; 415189c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) { 415289c6957cSBarry Smith i1 = idx[0]; 415389c6957cSBarry Smith i2 = idx[1]; 415489c6957cSBarry Smith idx += 2; 415589c6957cSBarry Smith tmp0 = t[i1]; 415689c6957cSBarry Smith tmp1 = t[i2]; 41579371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 41589371c9d4SSatish Balay v1 += 2; 41599371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 41609371c9d4SSatish Balay v2 += 2; 41619371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 41629371c9d4SSatish Balay v3 += 2; 41639371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 41649371c9d4SSatish Balay v4 += 2; 41659371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 41669371c9d4SSatish Balay v5 += 2; 416789c6957cSBarry Smith } 416889c6957cSBarry Smith 416989c6957cSBarry Smith if (n == sz - 1) { 417089c6957cSBarry Smith tmp0 = t[*idx]; 417189c6957cSBarry Smith sum1 -= v1[0] * tmp0; 417289c6957cSBarry Smith sum2 -= v2[0] * tmp0; 417389c6957cSBarry Smith sum3 -= v3[0] * tmp0; 417489c6957cSBarry Smith sum4 -= v4[0] * tmp0; 417589c6957cSBarry Smith sum5 -= v5[0] * tmp0; 417689c6957cSBarry Smith } 417789c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 417889c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 417989c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + 
sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 418089c6957cSBarry Smith x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 418189c6957cSBarry Smith x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 41829371c9d4SSatish Balay ibdiag += 25; 41839371c9d4SSatish Balay row += 5; 418489c6957cSBarry Smith break; 4185d71ae5a4SJacob Faibussowitsch default: 4186d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 418789c6957cSBarry Smith } 418889c6957cSBarry Smith } 41899566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); 41905850ef23SBarry Smith } 41919566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 41929566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 41933ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41942af78befSBarry Smith } 41952af78befSBarry Smith 4196*ff6a9541SJacob Faibussowitsch static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 4197d71ae5a4SJacob Faibussowitsch { 419889c6957cSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 419989c6957cSBarry Smith PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 420089c6957cSBarry Smith const MatScalar *bdiag = a->inode.bdiag; 420189c6957cSBarry Smith const PetscScalar *b; 420289c6957cSBarry Smith PetscInt m = a->inode.node_count, cnt = 0, i, row; 420389c6957cSBarry Smith const PetscInt *sizes = a->inode.size; 42042af78befSBarry Smith 420589c6957cSBarry Smith PetscFunctionBegin; 420608401ef6SPierre Jolivet PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 42079566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 42089566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 420989c6957cSBarry Smith cnt = 0; 421089c6957cSBarry Smith for (i = 
0, row = 0; i < m; i++) { 421189c6957cSBarry Smith switch (sizes[i]) { 421289c6957cSBarry Smith case 1: 42139371c9d4SSatish Balay x[row] = b[row] * bdiag[cnt++]; 42149371c9d4SSatish Balay row++; 421589c6957cSBarry Smith break; 421689c6957cSBarry Smith case 2: 42179371c9d4SSatish Balay x1 = b[row]; 42189371c9d4SSatish Balay x2 = b[row + 1]; 421989c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 422089c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 422189c6957cSBarry Smith x[row++] = tmp1; 422289c6957cSBarry Smith x[row++] = tmp2; 422389c6957cSBarry Smith cnt += 4; 422489c6957cSBarry Smith break; 422589c6957cSBarry Smith case 3: 42269371c9d4SSatish Balay x1 = b[row]; 42279371c9d4SSatish Balay x2 = b[row + 1]; 42289371c9d4SSatish Balay x3 = b[row + 2]; 422989c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 423089c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 423189c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 423289c6957cSBarry Smith x[row++] = tmp1; 423389c6957cSBarry Smith x[row++] = tmp2; 423489c6957cSBarry Smith x[row++] = tmp3; 423589c6957cSBarry Smith cnt += 9; 423689c6957cSBarry Smith break; 423789c6957cSBarry Smith case 4: 42389371c9d4SSatish Balay x1 = b[row]; 42399371c9d4SSatish Balay x2 = b[row + 1]; 42409371c9d4SSatish Balay x3 = b[row + 2]; 42419371c9d4SSatish Balay x4 = b[row + 3]; 424289c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 424389c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 424489c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 424589c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 424689c6957cSBarry Smith x[row++] = tmp1; 
424789c6957cSBarry Smith x[row++] = tmp2; 424889c6957cSBarry Smith x[row++] = tmp3; 424989c6957cSBarry Smith x[row++] = tmp4; 425089c6957cSBarry Smith cnt += 16; 425189c6957cSBarry Smith break; 425289c6957cSBarry Smith case 5: 42539371c9d4SSatish Balay x1 = b[row]; 42549371c9d4SSatish Balay x2 = b[row + 1]; 42559371c9d4SSatish Balay x3 = b[row + 2]; 42569371c9d4SSatish Balay x4 = b[row + 3]; 42579371c9d4SSatish Balay x5 = b[row + 4]; 425889c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 425989c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 426089c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 426189c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 426289c6957cSBarry Smith tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 426389c6957cSBarry Smith x[row++] = tmp1; 426489c6957cSBarry Smith x[row++] = tmp2; 426589c6957cSBarry Smith x[row++] = tmp3; 426689c6957cSBarry Smith x[row++] = tmp4; 426789c6957cSBarry Smith x[row++] = tmp5; 426889c6957cSBarry Smith cnt += 25; 426989c6957cSBarry Smith break; 4270d71ae5a4SJacob Faibussowitsch default: 4271d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 427289c6957cSBarry Smith } 427389c6957cSBarry Smith } 42749566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * cnt)); 42759566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 42769566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 42773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 427889c6957cSBarry Smith } 427989c6957cSBarry Smith 
4280d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4281d71ae5a4SJacob Faibussowitsch { 4282b215bc84SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4283b215bc84SStefano Zampini 4284b215bc84SStefano Zampini PetscFunctionBegin; 4285b215bc84SStefano Zampini a->inode.node_count = 0; 4286b215bc84SStefano Zampini a->inode.use = PETSC_FALSE; 4287b215bc84SStefano Zampini a->inode.checked = PETSC_FALSE; 4288b215bc84SStefano Zampini a->inode.mat_nonzerostate = -1; 4289b215bc84SStefano Zampini A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4290b215bc84SStefano Zampini A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4291b215bc84SStefano Zampini A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4292b215bc84SStefano Zampini A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4293b215bc84SStefano Zampini A->ops->coloringpatch = NULL; 4294b215bc84SStefano Zampini A->ops->multdiagonalblock = NULL; 4295ad540459SPierre Jolivet if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 42963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4297b215bc84SStefano Zampini } 4298b215bc84SStefano Zampini 42994c1414c8SBarry Smith /* 43004c1414c8SBarry Smith samestructure indicates that the matrix has not changed its nonzero structure so we 43014c1414c8SBarry Smith do not need to recompute the inodes 43024c1414c8SBarry Smith */ 4303d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode(Mat A) 4304d71ae5a4SJacob Faibussowitsch { 43054c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 43068758e1faSBarry Smith PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4307ace3abfcSBarry Smith PetscBool flag; 43088758e1faSBarry Smith const PetscInt *idx, *idy, *ii; 43094c1414c8SBarry Smith 43104c1414c8SBarry Smith PetscFunctionBegin; 4311b215bc84SStefano Zampini if (!a->inode.use) { 43129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 43139566063dSJacob Faibussowitsch PetscCall(PetscFree(a->inode.size)); 
43143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4315b215bc84SStefano Zampini } 43163ba16761SJacob Faibussowitsch if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 43174c1414c8SBarry Smith 4318d0f46423SBarry Smith m = A->rmap->n; 43199566063dSJacob Faibussowitsch if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size)); 4320b215bc84SStefano Zampini ns = a->inode.size; 43214c1414c8SBarry Smith 43224c1414c8SBarry Smith i = 0; 43234c1414c8SBarry Smith node_count = 0; 43244c1414c8SBarry Smith idx = a->j; 43254c1414c8SBarry Smith ii = a->i; 43264c1414c8SBarry Smith while (i < m) { /* For each row */ 43274c1414c8SBarry Smith nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 43284c1414c8SBarry Smith /* Limits the number of elements in a node to 'a->inode.limit' */ 43294c1414c8SBarry Smith for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 43304c1414c8SBarry Smith nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 43314c1414c8SBarry Smith if (nzy != nzx) break; 43324c1414c8SBarry Smith idy += nzx; /* Same nonzero pattern */ 43339566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 43344c1414c8SBarry Smith if (!flag) break; 43354c1414c8SBarry Smith } 43364c1414c8SBarry Smith ns[node_count++] = blk_size; 43374c1414c8SBarry Smith idx += blk_size * nzx; 43384c1414c8SBarry Smith i = j; 43394c1414c8SBarry Smith } 43402cb58ee3SKarl Rupp 43414c1414c8SBarry Smith /* If not enough inodes found,, do not use inode version of the routines */ 4342be6adb11SBarry Smith if (!m || node_count > .8 * m) { 43439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 43449566063dSJacob Faibussowitsch PetscCall(PetscFree(a->inode.size)); 43459566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. 
Not using Inode routines\n", node_count, m)); 43464c1414c8SBarry Smith } else { 4347d5f3da31SBarry Smith if (!A->factortype) { 4348375a6242SBarry Smith A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4349375a6242SBarry Smith if (A->rmap->n == A->cmap->n) { 43504108e4d5SBarry Smith A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 43514108e4d5SBarry Smith A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 43524108e4d5SBarry Smith A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 43534108e4d5SBarry Smith A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 43544108e4d5SBarry Smith A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4355375a6242SBarry Smith } 4356d3ac4fa3SBarry Smith } else { 4357d3ac4fa3SBarry Smith A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4358d3ac4fa3SBarry Smith } 43594c1414c8SBarry Smith a->inode.node_count = node_count; 43609566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". 
Using Inode routines\n", node_count, m, a->inode.limit)); 43614c1414c8SBarry Smith } 4362be6adb11SBarry Smith a->inode.checked = PETSC_TRUE; 4363a02bda8eSBarry Smith a->inode.mat_nonzerostate = A->nonzerostate; 43643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 43654c1414c8SBarry Smith } 43664c1414c8SBarry Smith 4367d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 4368d71ae5a4SJacob Faibussowitsch { 4369150f0143SBarry Smith Mat B = *C; 4370150f0143SBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4371150f0143SBarry Smith PetscInt m = A->rmap->n; 4372150f0143SBarry Smith 4373150f0143SBarry Smith PetscFunctionBegin; 4374150f0143SBarry Smith c->inode.use = a->inode.use; 4375150f0143SBarry Smith c->inode.limit = a->inode.limit; 4376150f0143SBarry Smith c->inode.max_limit = a->inode.max_limit; 4377ec710b6aSStefano Zampini c->inode.checked = PETSC_FALSE; 4378ec710b6aSStefano Zampini c->inode.size = NULL; 4379ec710b6aSStefano Zampini c->inode.node_count = 0; 4380ec710b6aSStefano Zampini c->inode.ibdiagvalid = PETSC_FALSE; 4381ec710b6aSStefano Zampini c->inode.ibdiag = NULL; 4382ec710b6aSStefano Zampini c->inode.bdiag = NULL; 4383ec710b6aSStefano Zampini c->inode.mat_nonzerostate = -1; 4384b215bc84SStefano Zampini if (a->inode.use) { 4385ec710b6aSStefano Zampini if (a->inode.checked && a->inode.size) { 43869566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &c->inode.size)); 43879566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1)); 4388ec710b6aSStefano Zampini 4389ec710b6aSStefano Zampini c->inode.checked = PETSC_TRUE; 4390ec710b6aSStefano Zampini c->inode.node_count = a->inode.node_count; 4391ec710b6aSStefano Zampini c->inode.mat_nonzerostate = (*C)->nonzerostate; 4392ec710b6aSStefano Zampini } 4393a02bda8eSBarry Smith /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 
43942c451681SBarry Smith if (!B->factortype) { 43952c451681SBarry Smith B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 43962c451681SBarry Smith B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 43972c451681SBarry Smith B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 43982c451681SBarry Smith B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 43992c451681SBarry Smith B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 44002c451681SBarry Smith B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4401150f0143SBarry Smith } else { 44022c451681SBarry Smith B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4403150f0143SBarry Smith } 4404150f0143SBarry Smith } 44053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4406150f0143SBarry Smith } 4407150f0143SBarry Smith 4408d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4409d71ae5a4SJacob Faibussowitsch { 44108758e1faSBarry Smith PetscInt k; 44118758e1faSBarry Smith const PetscInt *vi; 44126e111a19SKarl Rupp 441317454e89SShri Abhyankar PetscFunctionBegin; 441417454e89SShri Abhyankar vi = aj + ai[row]; 441517454e89SShri Abhyankar for (k = 0; k < nzl; k++) cols[k] = vi[k]; 441617454e89SShri Abhyankar vi = aj + adiag[row]; 441717454e89SShri Abhyankar cols[nzl] = vi[0]; 441817454e89SShri Abhyankar vi = aj + adiag[row + 1] + 1; 441917454e89SShri Abhyankar for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 44203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 442117454e89SShri Abhyankar } 44226936b636SHong Zhang /* 4423a02bda8eSBarry Smith MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4424a02bda8eSBarry Smith Modified from MatSeqAIJCheckInode(). 44256936b636SHong Zhang 44266936b636SHong Zhang Input Parameters: 4427abb87a52SBarry Smith . 
Mat A - ILU or LU matrix factor 4428abb87a52SBarry Smith 44296936b636SHong Zhang */ 4430d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4431d71ae5a4SJacob Faibussowitsch { 4432019b515eSShri Abhyankar Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4433019b515eSShri Abhyankar PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 44348758e1faSBarry Smith PetscInt *cols1, *cols2, *ns; 44358758e1faSBarry Smith const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4436ace3abfcSBarry Smith PetscBool flag; 4437019b515eSShri Abhyankar 4438019b515eSShri Abhyankar PetscFunctionBegin; 44393ba16761SJacob Faibussowitsch if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 44403ba16761SJacob Faibussowitsch if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4441019b515eSShri Abhyankar 4442019b515eSShri Abhyankar m = A->rmap->n; 44432205254eSKarl Rupp if (a->inode.size) ns = a->inode.size; 444448a46eb9SPierre Jolivet else PetscCall(PetscMalloc1(m + 1, &ns)); 4445019b515eSShri Abhyankar 4446019b515eSShri Abhyankar i = 0; 4447019b515eSShri Abhyankar node_count = 0; 44489566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4449019b515eSShri Abhyankar while (i < m) { /* For each row */ 4450019b515eSShri Abhyankar nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4451019b515eSShri Abhyankar nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4452019b515eSShri Abhyankar nzx = nzl1 + nzu1 + 1; 44533ba16761SJacob Faibussowitsch PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4454019b515eSShri Abhyankar 4455019b515eSShri Abhyankar /* Limits the number of elements in a node to 'a->inode.limit' */ 4456019b515eSShri Abhyankar for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4457019b515eSShri Abhyankar nzl2 = ai[j + 1] - ai[j]; 4458019b515eSShri Abhyankar nzu2 = adiag[j] - adiag[j + 1] - 1; 4459019b515eSShri 
Abhyankar nzy = nzl2 + nzu2 + 1; 4460019b515eSShri Abhyankar if (nzy != nzx) break; 44619566063dSJacob Faibussowitsch PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 44629566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 44638758e1faSBarry Smith if (!flag) break; 4464019b515eSShri Abhyankar } 4465019b515eSShri Abhyankar ns[node_count++] = blk_size; 4466019b515eSShri Abhyankar i = j; 4467019b515eSShri Abhyankar } 44689566063dSJacob Faibussowitsch PetscCall(PetscFree2(cols1, cols2)); 4469019b515eSShri Abhyankar /* If not enough inodes found,, do not use inode version of the routines */ 4470be6adb11SBarry Smith if (!m || node_count > .8 * m) { 44719566063dSJacob Faibussowitsch PetscCall(PetscFree(ns)); 44722205254eSKarl Rupp 4473019b515eSShri Abhyankar a->inode.node_count = 0; 44740298fd71SBarry Smith a->inode.size = NULL; 4475019b515eSShri Abhyankar a->inode.use = PETSC_FALSE; 44762205254eSKarl Rupp 44779566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4478019b515eSShri Abhyankar } else { 4479f4259b30SLisandro Dalcin A->ops->mult = NULL; 4480f4259b30SLisandro Dalcin A->ops->sor = NULL; 4481f4259b30SLisandro Dalcin A->ops->multadd = NULL; 4482f4259b30SLisandro Dalcin A->ops->getrowij = NULL; 4483f4259b30SLisandro Dalcin A->ops->restorerowij = NULL; 4484f4259b30SLisandro Dalcin A->ops->getcolumnij = NULL; 4485f4259b30SLisandro Dalcin A->ops->restorecolumnij = NULL; 4486f4259b30SLisandro Dalcin A->ops->coloringpatch = NULL; 4487f4259b30SLisandro Dalcin A->ops->multdiagonalblock = NULL; 4488019b515eSShri Abhyankar a->inode.node_count = node_count; 4489019b515eSShri Abhyankar a->inode.size = ns; 44902205254eSKarl Rupp 44919566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". 
Using Inode routines\n", node_count, m, a->inode.limit)); 4492019b515eSShri Abhyankar } 4493be6adb11SBarry Smith a->inode.checked = PETSC_TRUE; 44943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4495019b515eSShri Abhyankar } 4496019b515eSShri Abhyankar 4497d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4498d71ae5a4SJacob Faibussowitsch { 4499acf2f550SJed Brown Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4500acf2f550SJed Brown 4501acf2f550SJed Brown PetscFunctionBegin; 4502acf2f550SJed Brown a->inode.ibdiagvalid = PETSC_FALSE; 45033ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4504acf2f550SJed Brown } 4505acf2f550SJed Brown 45064c1414c8SBarry Smith /* 45074c1414c8SBarry Smith This is really ugly. if inodes are used this replaces the 45084c1414c8SBarry Smith permutations with ones that correspond to rows/cols of the matrix 45094c1414c8SBarry Smith rather then inode blocks 45104c1414c8SBarry Smith */ 4511d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4512d71ae5a4SJacob Faibussowitsch { 45134c1414c8SBarry Smith PetscFunctionBegin; 4514cac4c232SBarry Smith PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 45153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 45164c1414c8SBarry Smith } 45174c1414c8SBarry Smith 4518d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4519d71ae5a4SJacob Faibussowitsch { 45204c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 45215d0c19d7SBarry Smith PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 45225d0c19d7SBarry Smith const PetscInt *ridx, *cidx; 45234c1414c8SBarry Smith PetscInt row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx; 45244c1414c8SBarry Smith PetscInt nslim_col, *ns_col; 45254c1414c8SBarry Smith IS ris = *rperm, cis = *cperm; 
45264c1414c8SBarry Smith 45274c1414c8SBarry Smith PetscFunctionBegin; 45283ba16761SJacob Faibussowitsch if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 45293ba16761SJacob Faibussowitsch if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 45304c1414c8SBarry Smith 45319566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 45329566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns)); 45339566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(m, &permr, n, &permc)); 45344c1414c8SBarry Smith 45359566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ris, &ridx)); 45369566063dSJacob Faibussowitsch PetscCall(ISGetIndices(cis, &cidx)); 45374c1414c8SBarry Smith 45384c1414c8SBarry Smith /* Form the inode structure for the rows of permuted matric using inv perm*/ 45394c1414c8SBarry Smith for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i]; 45404c1414c8SBarry Smith 45414c1414c8SBarry Smith /* Construct the permutations for rows*/ 45424c1414c8SBarry Smith for (i = 0, row = 0; i < nslim_row; ++i) { 45434c1414c8SBarry Smith indx = ridx[i]; 45444c1414c8SBarry Smith start_val = tns[indx]; 45454c1414c8SBarry Smith end_val = tns[indx + 1]; 45464c1414c8SBarry Smith for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 45474c1414c8SBarry Smith } 45484c1414c8SBarry Smith 45494c1414c8SBarry Smith /* Form the inode structure for the columns of permuted matrix using inv perm*/ 45504c1414c8SBarry Smith for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i]; 45514c1414c8SBarry Smith 45524c1414c8SBarry Smith /* Construct permutations for columns */ 45534c1414c8SBarry Smith for (i = 0, col = 0; i < nslim_col; ++i) { 45544c1414c8SBarry Smith indx = cidx[i]; 45554c1414c8SBarry Smith start_val = tns[indx]; 45564c1414c8SBarry Smith end_val = tns[indx + 1]; 45574c1414c8SBarry 
Smith for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 45584c1414c8SBarry Smith } 45594c1414c8SBarry Smith 45609566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 45619566063dSJacob Faibussowitsch PetscCall(ISSetPermutation(*rperm)); 45629566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 45639566063dSJacob Faibussowitsch PetscCall(ISSetPermutation(*cperm)); 45644c1414c8SBarry Smith 45659566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ris, &ridx)); 45669566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(cis, &cidx)); 45674c1414c8SBarry Smith 45689566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col)); 45699566063dSJacob Faibussowitsch PetscCall(PetscFree2(permr, permc)); 45709566063dSJacob Faibussowitsch PetscCall(ISDestroy(&cis)); 45719566063dSJacob Faibussowitsch PetscCall(ISDestroy(&ris)); 45729566063dSJacob Faibussowitsch PetscCall(PetscFree(tns)); 45733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 45744c1414c8SBarry Smith } 45754c1414c8SBarry Smith 45764c1414c8SBarry Smith /*@C 457711a5261eSBarry Smith MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 45784c1414c8SBarry Smith 45793f9fe445SBarry Smith Not Collective 45804c1414c8SBarry Smith 45814c1414c8SBarry Smith Input Parameter: 458211a5261eSBarry Smith . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 45834c1414c8SBarry Smith 4584d8d19677SJose E. Roman Output Parameters: 45854c1414c8SBarry Smith + node_count - no of inodes present in the matrix. 45864c1414c8SBarry Smith . sizes - an array of size node_count,with sizes of each inode. 45874c1414c8SBarry Smith - limit - the max size used to generate the inodes. 
45884c1414c8SBarry Smith 45894c1414c8SBarry Smith Level: advanced 45904c1414c8SBarry Smith 459111a5261eSBarry Smith Note: 459295452b02SPatrick Sanan This routine returns some internal storage information 45934c1414c8SBarry Smith of the matrix, it is intended to be used by advanced users. 45944c1414c8SBarry Smith It should be called after the matrix is assembled. 45954c1414c8SBarry Smith The contents of the sizes[] array should not be changed. 45960298fd71SBarry Smith NULL may be passed for information not requested. 45974c1414c8SBarry Smith 4598db781477SPatrick Sanan .seealso: `MatGetInfo()` 45994c1414c8SBarry Smith @*/ 4600d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4601d71ae5a4SJacob Faibussowitsch { 46025f80ce2aSJacob Faibussowitsch PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 46034c1414c8SBarry Smith 46044c1414c8SBarry Smith PetscFunctionBegin; 46055f80ce2aSJacob Faibussowitsch PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 46069566063dSJacob Faibussowitsch PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 46079566063dSJacob Faibussowitsch if (f) PetscCall((*f)(A, node_count, sizes, limit)); 46083ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 46094c1414c8SBarry Smith } 46104c1414c8SBarry Smith 4611d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4612d71ae5a4SJacob Faibussowitsch { 46134c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 46144c1414c8SBarry Smith 46154c1414c8SBarry Smith PetscFunctionBegin; 46164c1414c8SBarry Smith if (node_count) *node_count = a->inode.node_count; 46174c1414c8SBarry Smith if (sizes) *sizes = a->inode.size; 46184c1414c8SBarry Smith if (limit) *limit = a->inode.limit; 46193ba16761SJacob Faibussowitsch 
PetscFunctionReturn(PETSC_SUCCESS); 46204c1414c8SBarry Smith } 4621