xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 32603206efff945aaef677fd0dad12e9aa894f1f)
14c1414c8SBarry Smith /*
24c1414c8SBarry Smith   This file provides high performance routines for the Inode format (compressed sparse row)
34c1414c8SBarry Smith   by taking advantage of rows with identical nonzero structure (I-nodes).
44c1414c8SBarry Smith */
5c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h>
6fb56d528SJed Brown #if defined(PETSC_HAVE_XMMINTRIN_H)
7fb56d528SJed Brown   #include <xmmintrin.h>
8fb56d528SJed Brown #endif
94c1414c8SBarry Smith 
10d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11d71ae5a4SJacob Faibussowitsch {
124c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
134c1414c8SBarry Smith   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
144c1414c8SBarry Smith 
154c1414c8SBarry Smith   PetscFunctionBegin;
16d0f46423SBarry Smith   n = A->cmap->n;
17d0f46423SBarry Smith   m = A->rmap->n;
1808401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
194c1414c8SBarry Smith   ns_row = a->inode.size;
204c1414c8SBarry Smith 
214c1414c8SBarry Smith   min_mn = (m < n) ? m : n;
224c1414c8SBarry Smith   if (!ns) {
23fbccb6d4SPierre Jolivet     for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++);
24fbccb6d4SPierre Jolivet     for (; count + 1 < n; count++, i++);
25ad540459SPierre Jolivet     if (count < n) i++;
264c1414c8SBarry Smith     *size = i;
273ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
284c1414c8SBarry Smith   }
299566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &ns_col));
304c1414c8SBarry Smith 
314c1414c8SBarry Smith   /* Use the same row structure wherever feasible. */
32ad540459SPierre Jolivet   for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
334c1414c8SBarry Smith 
344c1414c8SBarry Smith   /* if m < n; pad up the remainder with inode_limit */
35ad540459SPierre Jolivet   for (; count + 1 < n; count++, i++) ns_col[i] = 1;
36aaa8cc7dSPierre Jolivet   /* The last node is the odd ball. pad it up with the remaining rows; */
374c1414c8SBarry Smith   if (count < n) {
384c1414c8SBarry Smith     ns_col[i] = n - count;
394c1414c8SBarry Smith     i++;
404c1414c8SBarry Smith   } else if (count > n) {
414c1414c8SBarry Smith     /* Adjust for the over estimation */
424c1414c8SBarry Smith     ns_col[i - 1] += n - count;
434c1414c8SBarry Smith   }
444c1414c8SBarry Smith   *size = i;
454c1414c8SBarry Smith   *ns   = ns_col;
463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
474c1414c8SBarry Smith }
484c1414c8SBarry Smith 
494c1414c8SBarry Smith /*
504c1414c8SBarry Smith       This builds symmetric version of nonzero structure,
514c1414c8SBarry Smith */
52d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
53d71ae5a4SJacob Faibussowitsch {
544c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
558758e1faSBarry Smith   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
568758e1faSBarry Smith   PetscInt       *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
578758e1faSBarry Smith   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
584c1414c8SBarry Smith 
594c1414c8SBarry Smith   PetscFunctionBegin;
604c1414c8SBarry Smith   nslim_row = a->inode.node_count;
61d0f46423SBarry Smith   m         = A->rmap->n;
62d0f46423SBarry Smith   n         = A->cmap->n;
6308401ef6SPierre Jolivet   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
6408401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
654c1414c8SBarry Smith 
664c1414c8SBarry Smith   /* Use the row_inode as column_inode */
674c1414c8SBarry Smith   nslim_col = nslim_row;
684c1414c8SBarry Smith   ns_col    = ns_row;
694c1414c8SBarry Smith 
7035cb6cd3SPierre Jolivet   /* allocate space for reformatted inode structure */
719566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
724c1414c8SBarry Smith   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
734c1414c8SBarry Smith 
744c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
754c1414c8SBarry Smith     nsz = ns_col[i1];
762205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
774c1414c8SBarry Smith   }
784c1414c8SBarry Smith   /* allocate space for row pointers */
799566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
804c1414c8SBarry Smith   *iia = ia;
819566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_row + 1, &work));
824c1414c8SBarry Smith 
834c1414c8SBarry Smith   /* determine the number of columns in each row */
844c1414c8SBarry Smith   ia[0] = oshift;
854c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
864c1414c8SBarry Smith     j    = aj + ai[row] + ishift;
874c1414c8SBarry Smith     jmax = aj + ai[row + 1] + ishift;
8883fed2edSSatish Balay     if (j == jmax) continue; /* empty row */
894c1414c8SBarry Smith     col = *j++ + ishift;
904c1414c8SBarry Smith     i2  = tvc[col];
916aad120cSJose E. Roman     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
924c1414c8SBarry Smith       ia[i1 + 1]++;
934c1414c8SBarry Smith       ia[i2 + 1]++;
944c1414c8SBarry Smith       i2++; /* Start col of next node */
9590d2dec7SBarry Smith       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
964c1414c8SBarry Smith       i2 = tvc[col];
974c1414c8SBarry Smith     }
984c1414c8SBarry Smith     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
994c1414c8SBarry Smith   }
1004c1414c8SBarry Smith 
1014c1414c8SBarry Smith   /* shift ia[i] to point to next row */
1024c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_row + 1; i1++) {
1034c1414c8SBarry Smith     row = ia[i1 - 1];
1044c1414c8SBarry Smith     ia[i1] += row;
1054c1414c8SBarry Smith     work[i1 - 1] = row - oshift;
1064c1414c8SBarry Smith   }
1074c1414c8SBarry Smith 
1084c1414c8SBarry Smith   /* allocate space for column pointers */
1094c1414c8SBarry Smith   nz = ia[nslim_row] + (!ishift);
1109566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
1114c1414c8SBarry Smith   *jja = ja;
1124c1414c8SBarry Smith 
1134c1414c8SBarry Smith   /* loop over lower triangular part putting into ja */
1144c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
1154c1414c8SBarry Smith     j    = aj + ai[row] + ishift;
1164c1414c8SBarry Smith     jmax = aj + ai[row + 1] + ishift;
11783fed2edSSatish Balay     if (j == jmax) continue; /* empty row */
1184c1414c8SBarry Smith     col = *j++ + ishift;
1194c1414c8SBarry Smith     i2  = tvc[col];
1204c1414c8SBarry Smith     while (i2 < i1 && j < jmax) {
1214c1414c8SBarry Smith       ja[work[i2]++] = i1 + oshift;
1224c1414c8SBarry Smith       ja[work[i1]++] = i2 + oshift;
1234c1414c8SBarry Smith       ++i2;
12490d2dec7SBarry Smith       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
1254c1414c8SBarry Smith       i2 = tvc[col];
1264c1414c8SBarry Smith     }
1274c1414c8SBarry Smith     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
1284c1414c8SBarry Smith   }
1299566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
1309566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
1313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1324c1414c8SBarry Smith }
1334c1414c8SBarry Smith 
1344c1414c8SBarry Smith /*
1354c1414c8SBarry Smith       This builds nonsymmetric version of nonzero structure,
1364c1414c8SBarry Smith */
137d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
138d71ae5a4SJacob Faibussowitsch {
1394c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
1408758e1faSBarry Smith   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
1418758e1faSBarry Smith   PetscInt       *tns, *tvc, nsz, i1, i2;
1428758e1faSBarry Smith   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
1434c1414c8SBarry Smith 
1444c1414c8SBarry Smith   PetscFunctionBegin;
14508401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
1464c1414c8SBarry Smith   nslim_row = a->inode.node_count;
147d0f46423SBarry Smith   n         = A->cmap->n;
1484c1414c8SBarry Smith 
1494c1414c8SBarry Smith   /* Create The column_inode for this matrix */
1509566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
1514c1414c8SBarry Smith 
15235cb6cd3SPierre Jolivet   /* allocate space for reformatted column_inode structure */
1539566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
1544c1414c8SBarry Smith   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
1554c1414c8SBarry Smith 
1564c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
1574c1414c8SBarry Smith     nsz = ns_col[i1];
1582205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
1594c1414c8SBarry Smith   }
1604c1414c8SBarry Smith   /* allocate space for row pointers */
1619566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
1624c1414c8SBarry Smith   *iia = ia;
1639566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_row + 1, &work));
1644c1414c8SBarry Smith 
1654c1414c8SBarry Smith   /* determine the number of columns in each row */
1664c1414c8SBarry Smith   ia[0] = oshift;
1674c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
1684c1414c8SBarry Smith     j  = aj + ai[row] + ishift;
16983fed2edSSatish Balay     nz = ai[row + 1] - ai[row];
17083fed2edSSatish Balay     if (!nz) continue; /* empty row */
1714c1414c8SBarry Smith     col = *j++ + ishift;
1724c1414c8SBarry Smith     i2  = tvc[col];
1736aad120cSJose E. Roman     while (nz-- > 0) { /* off-diagonal elements */
1744c1414c8SBarry Smith       ia[i1 + 1]++;
1754c1414c8SBarry Smith       i2++; /* Start col of next node */
176a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
1774c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
1784c1414c8SBarry Smith     }
1794c1414c8SBarry Smith   }
1804c1414c8SBarry Smith 
1814c1414c8SBarry Smith   /* shift ia[i] to point to next row */
1824c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_row + 1; i1++) {
1834c1414c8SBarry Smith     row = ia[i1 - 1];
1844c1414c8SBarry Smith     ia[i1] += row;
1854c1414c8SBarry Smith     work[i1 - 1] = row - oshift;
1864c1414c8SBarry Smith   }
1874c1414c8SBarry Smith 
1884c1414c8SBarry Smith   /* allocate space for column pointers */
1894c1414c8SBarry Smith   nz = ia[nslim_row] + (!ishift);
1909566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
1914c1414c8SBarry Smith   *jja = ja;
1924c1414c8SBarry Smith 
1934c1414c8SBarry Smith   /* loop over matrix putting into ja */
1944c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
1954c1414c8SBarry Smith     j  = aj + ai[row] + ishift;
19683fed2edSSatish Balay     nz = ai[row + 1] - ai[row];
19783fed2edSSatish Balay     if (!nz) continue; /* empty row */
1984c1414c8SBarry Smith     col = *j++ + ishift;
1994c1414c8SBarry Smith     i2  = tvc[col];
2004c1414c8SBarry Smith     while (nz-- > 0) {
2014c1414c8SBarry Smith       ja[work[i1]++] = i2 + oshift;
2024c1414c8SBarry Smith       ++i2;
203a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
2044c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
2054c1414c8SBarry Smith     }
2064c1414c8SBarry Smith   }
2079566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
2089566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
2099566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
2103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2114c1414c8SBarry Smith }
2124c1414c8SBarry Smith 
213d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
214d71ae5a4SJacob Faibussowitsch {
2154c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2164c1414c8SBarry Smith 
2174c1414c8SBarry Smith   PetscFunctionBegin;
21850ba90b4SBarry Smith   if (n) *n = a->inode.node_count;
2193ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2208f7157efSSatish Balay   if (!blockcompressed) {
2219566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
2228f7157efSSatish Balay   } else if (symmetric) {
2239566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
2244c1414c8SBarry Smith   } else {
2259566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
2264c1414c8SBarry Smith   }
2273ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2284c1414c8SBarry Smith }
2294c1414c8SBarry Smith 
230d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
231d71ae5a4SJacob Faibussowitsch {
2324c1414c8SBarry Smith   PetscFunctionBegin;
2333ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2348f7157efSSatish Balay 
2358f7157efSSatish Balay   if (!blockcompressed) {
2369566063dSJacob Faibussowitsch     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
2378f7157efSSatish Balay   } else {
2389566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ia));
2399566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ja));
2408f7157efSSatish Balay   }
2413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2424c1414c8SBarry Smith }
2434c1414c8SBarry Smith 
244d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
245d71ae5a4SJacob Faibussowitsch {
2464c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2474c1414c8SBarry Smith   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
2484c1414c8SBarry Smith   PetscInt   *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
2494c1414c8SBarry Smith 
2504c1414c8SBarry Smith   PetscFunctionBegin;
25108401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2524c1414c8SBarry Smith   nslim_row = a->inode.node_count;
253d0f46423SBarry Smith   n         = A->cmap->n;
2544c1414c8SBarry Smith 
2554c1414c8SBarry Smith   /* Create The column_inode for this matrix */
2569566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
2574c1414c8SBarry Smith 
25835cb6cd3SPierre Jolivet   /* allocate space for reformatted column_inode structure */
2599566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
2604c1414c8SBarry Smith   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
2614c1414c8SBarry Smith 
2624c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
2634c1414c8SBarry Smith     nsz = ns_col[i1];
2642205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
2654c1414c8SBarry Smith   }
2664c1414c8SBarry Smith   /* allocate space for column pointers */
2679566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
2684c1414c8SBarry Smith   *iia = ia;
2699566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_col + 1, &work));
2704c1414c8SBarry Smith 
2714c1414c8SBarry Smith   /* determine the number of columns in each row */
2724c1414c8SBarry Smith   ia[0] = oshift;
2734c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
2744c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
2754c1414c8SBarry Smith     col = *j++ + ishift;
2764c1414c8SBarry Smith     i2  = tvc[col];
2774c1414c8SBarry Smith     nz  = ai[row + 1] - ai[row];
2786aad120cSJose E. Roman     while (nz-- > 0) { /* off-diagonal elements */
2794c1414c8SBarry Smith       /* ia[i1+1]++; */
2804c1414c8SBarry Smith       ia[i2 + 1]++;
2814c1414c8SBarry Smith       i2++;
282a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
2834c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
2844c1414c8SBarry Smith     }
2854c1414c8SBarry Smith   }
2864c1414c8SBarry Smith 
2874c1414c8SBarry Smith   /* shift ia[i] to point to next col */
2884c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_col + 1; i1++) {
2894c1414c8SBarry Smith     col = ia[i1 - 1];
2904c1414c8SBarry Smith     ia[i1] += col;
2914c1414c8SBarry Smith     work[i1 - 1] = col - oshift;
2924c1414c8SBarry Smith   }
2934c1414c8SBarry Smith 
2944c1414c8SBarry Smith   /* allocate space for column pointers */
2954c1414c8SBarry Smith   nz = ia[nslim_col] + (!ishift);
2969566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
2974c1414c8SBarry Smith   *jja = ja;
2984c1414c8SBarry Smith 
2994c1414c8SBarry Smith   /* loop over matrix putting into ja */
3004c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
3014c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
3024c1414c8SBarry Smith     col = *j++ + ishift;
3034c1414c8SBarry Smith     i2  = tvc[col];
3044c1414c8SBarry Smith     nz  = ai[row + 1] - ai[row];
3054c1414c8SBarry Smith     while (nz-- > 0) {
3064c1414c8SBarry Smith       /* ja[work[i1]++] = i2 + oshift; */
3074c1414c8SBarry Smith       ja[work[i2]++] = i1 + oshift;
3084c1414c8SBarry Smith       i2++;
309a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
3104c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
3114c1414c8SBarry Smith     }
3124c1414c8SBarry Smith   }
3139566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
3149566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
3159566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
3163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3174c1414c8SBarry Smith }
3184c1414c8SBarry Smith 
319d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
320d71ae5a4SJacob Faibussowitsch {
3214c1414c8SBarry Smith   PetscFunctionBegin;
3229566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, n, NULL));
3233ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
3244c1414c8SBarry Smith 
3258f7157efSSatish Balay   if (!blockcompressed) {
3269566063dSJacob Faibussowitsch     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
3278f7157efSSatish Balay   } else if (symmetric) {
328a5b23f4aSJose E. Roman     /* Since the indices are symmetric it doesn't matter */
3299566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
3304c1414c8SBarry Smith   } else {
3319566063dSJacob Faibussowitsch     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
3324c1414c8SBarry Smith   }
3333ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3344c1414c8SBarry Smith }
3354c1414c8SBarry Smith 
336d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
337d71ae5a4SJacob Faibussowitsch {
3384c1414c8SBarry Smith   PetscFunctionBegin;
3393ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
3408f7157efSSatish Balay   if (!blockcompressed) {
3419566063dSJacob Faibussowitsch     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
3428f7157efSSatish Balay   } else {
3439566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ia));
3449566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ja));
3458f7157efSSatish Balay   }
3463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3474c1414c8SBarry Smith }
3484c1414c8SBarry Smith 
349d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
350d71ae5a4SJacob Faibussowitsch {
3514c1414c8SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
3524c1414c8SBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
353d9fead3dSBarry Smith   PetscScalar       *y;
354dd6ea824SBarry Smith   const PetscScalar *x;
355dd6ea824SBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5;
3568758e1faSBarry Smith   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
3578758e1faSBarry Smith   const PetscInt    *idx, *ns, *ii;
3584c1414c8SBarry Smith 
3594c1414c8SBarry Smith #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
3604c1414c8SBarry Smith   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
3614c1414c8SBarry Smith #endif
3624c1414c8SBarry Smith 
3634c1414c8SBarry Smith   PetscFunctionBegin;
36408401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
3654c1414c8SBarry Smith   node_max = a->inode.node_count;
3664c1414c8SBarry Smith   ns       = a->inode.size; /* Node Size array */
3679566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3689566063dSJacob Faibussowitsch   PetscCall(VecGetArray(yy, &y));
3694c1414c8SBarry Smith   idx = a->j;
3704c1414c8SBarry Smith   v1  = a->a;
3714c1414c8SBarry Smith   ii  = a->i;
3724c1414c8SBarry Smith 
3734c1414c8SBarry Smith   for (i = 0, row = 0; i < node_max; ++i) {
3744c1414c8SBarry Smith     nsz = ns[i];
3754c1414c8SBarry Smith     n   = ii[1] - ii[0];
37698c9bda7SSatish Balay     nonzerorow += (n > 0) * nsz;
3774c1414c8SBarry Smith     ii += nsz;
37850d8bf02SJed Brown     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
37950d8bf02SJed Brown     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
3804c1414c8SBarry Smith     sz = n;                                                                /* No of non zeros in this row */
3814c1414c8SBarry Smith                                                                            /* Switch on the size of Node */
3824c1414c8SBarry Smith     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
3834c1414c8SBarry Smith     case 1:
38475567043SBarry Smith       sum1 = 0.;
3854c1414c8SBarry Smith 
3864c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
3874c1414c8SBarry Smith         i1 = idx[0]; /* The instructions are ordered to */
3884c1414c8SBarry Smith         i2 = idx[1]; /* make the compiler's job easy */
3894c1414c8SBarry Smith         idx += 2;
3904c1414c8SBarry Smith         tmp0 = x[i1];
3914c1414c8SBarry Smith         tmp1 = x[i2];
3929371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
3939371c9d4SSatish Balay         v1 += 2;
3944c1414c8SBarry Smith       }
3954c1414c8SBarry Smith 
3964c1414c8SBarry Smith       if (n == sz - 1) { /* Take care of the last nonzero  */
3974c1414c8SBarry Smith         tmp0 = x[*idx++];
3984c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
3994c1414c8SBarry Smith       }
4004c1414c8SBarry Smith       y[row++] = sum1;
4014c1414c8SBarry Smith       break;
4024c1414c8SBarry Smith     case 2:
40375567043SBarry Smith       sum1 = 0.;
40475567043SBarry Smith       sum2 = 0.;
4054c1414c8SBarry Smith       v2   = v1 + n;
4064c1414c8SBarry Smith 
4074c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4084c1414c8SBarry Smith         i1 = idx[0];
4094c1414c8SBarry Smith         i2 = idx[1];
4104c1414c8SBarry Smith         idx += 2;
4114c1414c8SBarry Smith         tmp0 = x[i1];
4124c1414c8SBarry Smith         tmp1 = x[i2];
4139371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4149371c9d4SSatish Balay         v1 += 2;
4159371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4169371c9d4SSatish Balay         v2 += 2;
4174c1414c8SBarry Smith       }
4184c1414c8SBarry Smith       if (n == sz - 1) {
4194c1414c8SBarry Smith         tmp0 = x[*idx++];
4204c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4214c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4224c1414c8SBarry Smith       }
4234c1414c8SBarry Smith       y[row++] = sum1;
4244c1414c8SBarry Smith       y[row++] = sum2;
4254c1414c8SBarry Smith       v1       = v2; /* Since the next block to be processed starts there*/
4264c1414c8SBarry Smith       idx += sz;
4274c1414c8SBarry Smith       break;
4284c1414c8SBarry Smith     case 3:
42975567043SBarry Smith       sum1 = 0.;
43075567043SBarry Smith       sum2 = 0.;
43175567043SBarry Smith       sum3 = 0.;
4324c1414c8SBarry Smith       v2   = v1 + n;
4334c1414c8SBarry Smith       v3   = v2 + n;
4344c1414c8SBarry Smith 
4354c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4364c1414c8SBarry Smith         i1 = idx[0];
4374c1414c8SBarry Smith         i2 = idx[1];
4384c1414c8SBarry Smith         idx += 2;
4394c1414c8SBarry Smith         tmp0 = x[i1];
4404c1414c8SBarry Smith         tmp1 = x[i2];
4419371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4429371c9d4SSatish Balay         v1 += 2;
4439371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4449371c9d4SSatish Balay         v2 += 2;
4459371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
4469371c9d4SSatish Balay         v3 += 2;
4474c1414c8SBarry Smith       }
4484c1414c8SBarry Smith       if (n == sz - 1) {
4494c1414c8SBarry Smith         tmp0 = x[*idx++];
4504c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4514c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4524c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
4534c1414c8SBarry Smith       }
4544c1414c8SBarry Smith       y[row++] = sum1;
4554c1414c8SBarry Smith       y[row++] = sum2;
4564c1414c8SBarry Smith       y[row++] = sum3;
4574c1414c8SBarry Smith       v1       = v3; /* Since the next block to be processed starts there*/
4584c1414c8SBarry Smith       idx += 2 * sz;
4594c1414c8SBarry Smith       break;
4604c1414c8SBarry Smith     case 4:
46175567043SBarry Smith       sum1 = 0.;
46275567043SBarry Smith       sum2 = 0.;
46375567043SBarry Smith       sum3 = 0.;
46475567043SBarry Smith       sum4 = 0.;
4654c1414c8SBarry Smith       v2   = v1 + n;
4664c1414c8SBarry Smith       v3   = v2 + n;
4674c1414c8SBarry Smith       v4   = v3 + n;
4684c1414c8SBarry Smith 
4694c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4704c1414c8SBarry Smith         i1 = idx[0];
4714c1414c8SBarry Smith         i2 = idx[1];
4724c1414c8SBarry Smith         idx += 2;
4734c1414c8SBarry Smith         tmp0 = x[i1];
4744c1414c8SBarry Smith         tmp1 = x[i2];
4759371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4769371c9d4SSatish Balay         v1 += 2;
4779371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4789371c9d4SSatish Balay         v2 += 2;
4799371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
4809371c9d4SSatish Balay         v3 += 2;
4819371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
4829371c9d4SSatish Balay         v4 += 2;
4834c1414c8SBarry Smith       }
4844c1414c8SBarry Smith       if (n == sz - 1) {
4854c1414c8SBarry Smith         tmp0 = x[*idx++];
4864c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4874c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4884c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
4894c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
4904c1414c8SBarry Smith       }
4914c1414c8SBarry Smith       y[row++] = sum1;
4924c1414c8SBarry Smith       y[row++] = sum2;
4934c1414c8SBarry Smith       y[row++] = sum3;
4944c1414c8SBarry Smith       y[row++] = sum4;
4954c1414c8SBarry Smith       v1       = v4; /* Since the next block to be processed starts there*/
4964c1414c8SBarry Smith       idx += 3 * sz;
4974c1414c8SBarry Smith       break;
4984c1414c8SBarry Smith     case 5:
49975567043SBarry Smith       sum1 = 0.;
50075567043SBarry Smith       sum2 = 0.;
50175567043SBarry Smith       sum3 = 0.;
50275567043SBarry Smith       sum4 = 0.;
50375567043SBarry Smith       sum5 = 0.;
5044c1414c8SBarry Smith       v2   = v1 + n;
5054c1414c8SBarry Smith       v3   = v2 + n;
5064c1414c8SBarry Smith       v4   = v3 + n;
5074c1414c8SBarry Smith       v5   = v4 + n;
5084c1414c8SBarry Smith 
5094c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
5104c1414c8SBarry Smith         i1 = idx[0];
5114c1414c8SBarry Smith         i2 = idx[1];
5124c1414c8SBarry Smith         idx += 2;
5134c1414c8SBarry Smith         tmp0 = x[i1];
5144c1414c8SBarry Smith         tmp1 = x[i2];
5159371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
5169371c9d4SSatish Balay         v1 += 2;
5179371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
5189371c9d4SSatish Balay         v2 += 2;
5199371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
5209371c9d4SSatish Balay         v3 += 2;
5219371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
5229371c9d4SSatish Balay         v4 += 2;
5239371c9d4SSatish Balay         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
5249371c9d4SSatish Balay         v5 += 2;
5254c1414c8SBarry Smith       }
5264c1414c8SBarry Smith       if (n == sz - 1) {
5274c1414c8SBarry Smith         tmp0 = x[*idx++];
5284c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
5294c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
5304c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
5314c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
5324c1414c8SBarry Smith         sum5 += *v5++ * tmp0;
5334c1414c8SBarry Smith       }
5344c1414c8SBarry Smith       y[row++] = sum1;
5354c1414c8SBarry Smith       y[row++] = sum2;
5364c1414c8SBarry Smith       y[row++] = sum3;
5374c1414c8SBarry Smith       y[row++] = sum4;
5384c1414c8SBarry Smith       y[row++] = sum5;
5394c1414c8SBarry Smith       v1       = v5; /* Since the next block to be processed starts there */
5404c1414c8SBarry Smith       idx += 4 * sz;
5414c1414c8SBarry Smith       break;
542d71ae5a4SJacob Faibussowitsch     default:
543d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
5444c1414c8SBarry Smith     }
5454c1414c8SBarry Smith   }
5469566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5479566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(yy, &y));
5489566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
5493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5504c1414c8SBarry Smith }
5512ef1f0ffSBarry Smith 
5524108e4d5SBarry Smith /* Almost same code as the MatMult_SeqAIJ_Inode() */
/*
  MatMultAdd_SeqAIJ_Inode - Inode-based multiply-add: for every row, stores into yy the matching
  entry of zz plus the product of that row of A with xx.

  Input Parameters:
+ A  - matrix whose data is a Mat_SeqAIJ carrying inode information (a->inode.size, a->inode.node_count)
. xx - vector that is multiplied by the rows of A
- zz - vector whose entries are the starting values of the row sums

  Output Parameter:
. yy - receives one value per row

  Notes:
  Rows are handled one node at a time; a node is ns[i] consecutive rows and only node sizes 1 to 5
  are implemented. The nonzero count of the first row of a node is used for every row of that node,
  and the column indices are read only once per node (from its first row), so all rows of a node
  have to share the same column indices.

  Fails with PETSC_ERR_COR when the inode size array is missing or an unsupported node size is met.
  2.0 * a->nz flops are logged.
*/
553d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
554d71ae5a4SJacob Faibussowitsch {
5554c1414c8SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
5564c1414c8SBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
5578758e1faSBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5;
5588758e1faSBarry Smith   const PetscScalar *x;
5598758e1faSBarry Smith   PetscScalar       *y, *z, *zt;
5608758e1faSBarry Smith   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
5618758e1faSBarry Smith   const PetscInt    *idx, *ns, *ii;
5624c1414c8SBarry Smith 
5634c1414c8SBarry Smith   PetscFunctionBegin;
56408401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
5654c1414c8SBarry Smith   node_max = a->inode.node_count;
5664c1414c8SBarry Smith   ns       = a->inode.size; /* Node Size array */
5672205254eSKarl Rupp 
5689566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5699566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
  /* zt is a read cursor over z; it supplies the starting value of each row sum */
5704c1414c8SBarry Smith   zt = z;
5714c1414c8SBarry Smith 
  /* idx and v1 are cursors over the column indices and values; ii is a cursor over the row offsets */
5724c1414c8SBarry Smith   idx = a->j;
5734c1414c8SBarry Smith   v1  = a->a;
5744c1414c8SBarry Smith   ii  = a->i;
5754c1414c8SBarry Smith 
  /* One iteration per node; ii is advanced by the node size so it always sits on the first row of the node */
5764c1414c8SBarry Smith   for (i = 0, row = 0; i < node_max; ++i) {
5774c1414c8SBarry Smith     nsz = ns[i];
5784c1414c8SBarry Smith     n   = ii[1] - ii[0];
5794c1414c8SBarry Smith     ii += nsz;
5804c1414c8SBarry Smith     sz = n;        /* No of non zeros in this row */
5814c1414c8SBarry Smith                    /* Switch on the size of Node */
5824c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
5834c1414c8SBarry Smith     case 1:
5844c1414c8SBarry Smith       sum1 = *zt++;
5854c1414c8SBarry Smith 
      /* Two nonzeros per pass; an odd leftover is handled after the loop */
5864c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
5874c1414c8SBarry Smith         i1 = idx[0]; /* The instructions are ordered to */
5884c1414c8SBarry Smith         i2 = idx[1]; /* make the compiler's job easy */
5894c1414c8SBarry Smith         idx += 2;
5904c1414c8SBarry Smith         tmp0 = x[i1];
5914c1414c8SBarry Smith         tmp1 = x[i2];
5929371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
5939371c9d4SSatish Balay         v1 += 2;
5944c1414c8SBarry Smith       }
5954c1414c8SBarry Smith 
5964c1414c8SBarry Smith       if (n == sz - 1) { /* Take care of the last nonzero  */
5974c1414c8SBarry Smith         tmp0 = x[*idx++];
5984c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
5994c1414c8SBarry Smith       }
6004c1414c8SBarry Smith       y[row++] = sum1;
6014c1414c8SBarry Smith       break;
6024c1414c8SBarry Smith     case 2:
6034c1414c8SBarry Smith       sum1 = *zt++;
6044c1414c8SBarry Smith       sum2 = *zt++;
      /* n still equals sz here, so v2 starts sz values after v1, i.e. at the values of the second row */
6054c1414c8SBarry Smith       v2   = v1 + n;
6064c1414c8SBarry Smith 
6074c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6084c1414c8SBarry Smith         i1 = idx[0];
6094c1414c8SBarry Smith         i2 = idx[1];
6104c1414c8SBarry Smith         idx += 2;
6114c1414c8SBarry Smith         tmp0 = x[i1];
6124c1414c8SBarry Smith         tmp1 = x[i2];
6139371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6149371c9d4SSatish Balay         v1 += 2;
6159371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6169371c9d4SSatish Balay         v2 += 2;
6174c1414c8SBarry Smith       }
6184c1414c8SBarry Smith       if (n == sz - 1) {
6194c1414c8SBarry Smith         tmp0 = x[*idx++];
6204c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6214c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6224c1414c8SBarry Smith       }
6234c1414c8SBarry Smith       y[row++] = sum1;
6244c1414c8SBarry Smith       y[row++] = sum2;
6254c1414c8SBarry Smith       v1       = v2; /* Since the next block to be processed starts there*/
      /* Only the first row's column indices were consumed; step over the sz indices of the second row */
6264c1414c8SBarry Smith       idx += sz;
6274c1414c8SBarry Smith       break;
6284c1414c8SBarry Smith     case 3:
6294c1414c8SBarry Smith       sum1 = *zt++;
6304c1414c8SBarry Smith       sum2 = *zt++;
6314c1414c8SBarry Smith       sum3 = *zt++;
6324c1414c8SBarry Smith       v2   = v1 + n;
6334c1414c8SBarry Smith       v3   = v2 + n;
6344c1414c8SBarry Smith 
6354c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6364c1414c8SBarry Smith         i1 = idx[0];
6374c1414c8SBarry Smith         i2 = idx[1];
6384c1414c8SBarry Smith         idx += 2;
6394c1414c8SBarry Smith         tmp0 = x[i1];
6404c1414c8SBarry Smith         tmp1 = x[i2];
6419371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6429371c9d4SSatish Balay         v1 += 2;
6439371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6449371c9d4SSatish Balay         v2 += 2;
6459371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
6469371c9d4SSatish Balay         v3 += 2;
6474c1414c8SBarry Smith       }
6484c1414c8SBarry Smith       if (n == sz - 1) {
6494c1414c8SBarry Smith         tmp0 = x[*idx++];
6504c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6514c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6524c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
6534c1414c8SBarry Smith       }
6544c1414c8SBarry Smith       y[row++] = sum1;
6554c1414c8SBarry Smith       y[row++] = sum2;
6564c1414c8SBarry Smith       y[row++] = sum3;
6574c1414c8SBarry Smith       v1       = v3; /* Since the next block to be processed starts there*/
      /* Step over the column indices of the two remaining rows of the node */
6584c1414c8SBarry Smith       idx += 2 * sz;
6594c1414c8SBarry Smith       break;
6604c1414c8SBarry Smith     case 4:
6614c1414c8SBarry Smith       sum1 = *zt++;
6624c1414c8SBarry Smith       sum2 = *zt++;
6634c1414c8SBarry Smith       sum3 = *zt++;
6644c1414c8SBarry Smith       sum4 = *zt++;
6654c1414c8SBarry Smith       v2   = v1 + n;
6664c1414c8SBarry Smith       v3   = v2 + n;
6674c1414c8SBarry Smith       v4   = v3 + n;
6684c1414c8SBarry Smith 
6694c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6704c1414c8SBarry Smith         i1 = idx[0];
6714c1414c8SBarry Smith         i2 = idx[1];
6724c1414c8SBarry Smith         idx += 2;
6734c1414c8SBarry Smith         tmp0 = x[i1];
6744c1414c8SBarry Smith         tmp1 = x[i2];
6759371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6769371c9d4SSatish Balay         v1 += 2;
6779371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6789371c9d4SSatish Balay         v2 += 2;
6799371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
6809371c9d4SSatish Balay         v3 += 2;
6819371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
6829371c9d4SSatish Balay         v4 += 2;
6834c1414c8SBarry Smith       }
6844c1414c8SBarry Smith       if (n == sz - 1) {
6854c1414c8SBarry Smith         tmp0 = x[*idx++];
6864c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6874c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6884c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
6894c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
6904c1414c8SBarry Smith       }
6914c1414c8SBarry Smith       y[row++] = sum1;
6924c1414c8SBarry Smith       y[row++] = sum2;
6934c1414c8SBarry Smith       y[row++] = sum3;
6944c1414c8SBarry Smith       y[row++] = sum4;
6954c1414c8SBarry Smith       v1       = v4; /* Since the next block to be processed starts there*/
      /* Step over the column indices of the three remaining rows of the node */
6964c1414c8SBarry Smith       idx += 3 * sz;
6974c1414c8SBarry Smith       break;
6984c1414c8SBarry Smith     case 5:
6994c1414c8SBarry Smith       sum1 = *zt++;
7004c1414c8SBarry Smith       sum2 = *zt++;
7014c1414c8SBarry Smith       sum3 = *zt++;
7024c1414c8SBarry Smith       sum4 = *zt++;
7034c1414c8SBarry Smith       sum5 = *zt++;
7044c1414c8SBarry Smith       v2   = v1 + n;
7054c1414c8SBarry Smith       v3   = v2 + n;
7064c1414c8SBarry Smith       v4   = v3 + n;
7074c1414c8SBarry Smith       v5   = v4 + n;
7084c1414c8SBarry Smith 
7094c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
7104c1414c8SBarry Smith         i1 = idx[0];
7114c1414c8SBarry Smith         i2 = idx[1];
7124c1414c8SBarry Smith         idx += 2;
7134c1414c8SBarry Smith         tmp0 = x[i1];
7144c1414c8SBarry Smith         tmp1 = x[i2];
7159371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
7169371c9d4SSatish Balay         v1 += 2;
7179371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
7189371c9d4SSatish Balay         v2 += 2;
7199371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
7209371c9d4SSatish Balay         v3 += 2;
7219371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
7229371c9d4SSatish Balay         v4 += 2;
7239371c9d4SSatish Balay         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
7249371c9d4SSatish Balay         v5 += 2;
7254c1414c8SBarry Smith       }
7264c1414c8SBarry Smith       if (n == sz - 1) {
7274c1414c8SBarry Smith         tmp0 = x[*idx++];
7284c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
7294c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
7304c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
7314c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
7324c1414c8SBarry Smith         sum5 += *v5++ * tmp0;
7334c1414c8SBarry Smith       }
7344c1414c8SBarry Smith       y[row++] = sum1;
7354c1414c8SBarry Smith       y[row++] = sum2;
7364c1414c8SBarry Smith       y[row++] = sum3;
7374c1414c8SBarry Smith       y[row++] = sum4;
7384c1414c8SBarry Smith       y[row++] = sum5;
7394c1414c8SBarry Smith       v1       = v5; /* Since the next block to be processed starts there */
      /* Step over the column indices of the four remaining rows of the node */
7404c1414c8SBarry Smith       idx += 4 * sz;
7414c1414c8SBarry Smith       break;
742d71ae5a4SJacob Faibussowitsch     default:
743d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
7444c1414c8SBarry Smith     }
7454c1414c8SBarry Smith   }
7469566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7479566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
7489566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
7493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
7504c1414c8SBarry Smith }
7514c1414c8SBarry Smith 
/*
  MatSolve_SeqAIJ_Inode_inplace - Forward and backward triangular solves with a Mat_SeqAIJ, processing
  the rows of each inode together.

  Input Parameters:
+ A  - matrix; the routine reads a->i, a->j, a->a, a->diag, a->row, a->col, a->solve_work and the
       inode data (a->inode.size, a->inode.node_count)
- bb - right-hand side

  Output Parameter:
. xx - receives the solution entries

  Notes:
  The forward sweep reads bb through the indices of a->row and stores its results in a->solve_work.
  The backward sweep walks the rows from last to first, overwrites a->solve_work row by row and writes
  each value to xx through the indices of a->col, which are traversed from the last entry to the first.

  Every value produced by the backward sweep is multiplied by a_a[ad[row]], the value stored at the
  diagonal position (presumably the factorization stores the inverted pivot there - TODO confirm).

  Only node sizes 1 to 5 are implemented; any other size, or a missing inode size array, raises
  PETSC_ERR_COR.

  NOTE(review): if SETERRQ leaves the function, the index sets and vector arrays obtained at the top
  are not restored on the 'default' paths - confirm whether that matters here.
*/
752ff6a9541SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
753d71ae5a4SJacob Faibussowitsch {
7544c1414c8SBarry Smith   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
7554c1414c8SBarry Smith   IS                 iscol = a->col, isrow = a->row;
7565d0c19d7SBarry Smith   const PetscInt    *r, *c, *rout, *cout;
7578758e1faSBarry Smith   PetscInt           i, j, n = A->rmap->n, nz;
7588758e1faSBarry Smith   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
7598758e1faSBarry Smith   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
760d9fead3dSBarry Smith   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
761d9fead3dSBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5;
762dd6ea824SBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
763dd6ea824SBarry Smith   const PetscScalar *b;
7644c1414c8SBarry Smith 
7654c1414c8SBarry Smith   PetscFunctionBegin;
76608401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
7674c1414c8SBarry Smith   node_max = a->inode.node_count;
7684c1414c8SBarry Smith   ns       = a->inode.size; /* Node Size array */
7694c1414c8SBarry Smith 
7709566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
7719566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(xx, &x));
7724c1414c8SBarry Smith   tmp = a->solve_work;
7734c1414c8SBarry Smith 
  /* r walks the row indices forward; c starts on the last column index and is walked backward */
7749371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
7759371c9d4SSatish Balay   r = rout;
7769371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
7779371c9d4SSatish Balay   c = cout + (n - 1);
7784c1414c8SBarry Smith 
7794c1414c8SBarry Smith   /* forward solve the lower triangular */
  /* tmps aliases tmp: values are read through tmps and written through tmp */
7804c1414c8SBarry Smith   tmps = tmp;
7814c1414c8SBarry Smith   aa   = a_a;
7824c1414c8SBarry Smith   aj   = a_j;
7834c1414c8SBarry Smith   ad   = a->diag;
7844c1414c8SBarry Smith 
7854c1414c8SBarry Smith   for (i = 0, row = 0; i < node_max; ++i) {
7864c1414c8SBarry Smith     nsz = ns[i];
7874c1414c8SBarry Smith     aii = ai[row];
7884c1414c8SBarry Smith     v1  = aa + aii;
7894c1414c8SBarry Smith     vi  = aj + aii;
    /* number of entries of the node's first row that are stored before its diagonal position */
7904c1414c8SBarry Smith     nz  = ad[row] - aii;
79126549573SJed Brown     if (i < node_max - 1) {
79226549573SJed Brown       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
79391c35059SPierre Jolivet       * but our indexing to determine its size could. */
79450d8bf02SJed Brown       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
79526549573SJed Brown       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
79650d8bf02SJed Brown       PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
79726549573SJed Brown       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
79826549573SJed Brown     }
7994c1414c8SBarry Smith 
8004c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
8014c1414c8SBarry Smith     case 1:
8024c1414c8SBarry Smith       sum1 = b[*r++];
      /* Two entries per pass; an odd leftover is handled after the loop */
8034c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8044c1414c8SBarry Smith         i0 = vi[0];
8054c1414c8SBarry Smith         i1 = vi[1];
8064c1414c8SBarry Smith         vi += 2;
8074c1414c8SBarry Smith         tmp0 = tmps[i0];
8084c1414c8SBarry Smith         tmp1 = tmps[i1];
8099371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8109371c9d4SSatish Balay         v1 += 2;
8114c1414c8SBarry Smith       }
8124c1414c8SBarry Smith       if (j == nz - 1) {
8134c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8144c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8154c1414c8SBarry Smith       }
8164c1414c8SBarry Smith       tmp[row++] = sum1;
8174c1414c8SBarry Smith       break;
8184c1414c8SBarry Smith     case 2:
8194c1414c8SBarry Smith       sum1 = b[*r++];
8204c1414c8SBarry Smith       sum2 = b[*r++];
8214c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8224c1414c8SBarry Smith 
8234c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8244c1414c8SBarry Smith         i0 = vi[0];
8254c1414c8SBarry Smith         i1 = vi[1];
8264c1414c8SBarry Smith         vi += 2;
8274c1414c8SBarry Smith         tmp0 = tmps[i0];
8284c1414c8SBarry Smith         tmp1 = tmps[i1];
8299371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8309371c9d4SSatish Balay         v1 += 2;
8319371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8329371c9d4SSatish Balay         v2 += 2;
8334c1414c8SBarry Smith       }
8344c1414c8SBarry Smith       if (j == nz - 1) {
8354c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8364c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8374c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
8384c1414c8SBarry Smith       }
      /* the next value of the second row multiplies the result just obtained for the first row of the node */
8394c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
8404c1414c8SBarry Smith       tmp[row++] = sum1;
8414c1414c8SBarry Smith       tmp[row++] = sum2;
8424c1414c8SBarry Smith       break;
8434c1414c8SBarry Smith     case 3:
8444c1414c8SBarry Smith       sum1 = b[*r++];
8454c1414c8SBarry Smith       sum2 = b[*r++];
8464c1414c8SBarry Smith       sum3 = b[*r++];
8474c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8484c1414c8SBarry Smith       v3   = aa + ai[row + 2];
8494c1414c8SBarry Smith 
8504c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8514c1414c8SBarry Smith         i0 = vi[0];
8524c1414c8SBarry Smith         i1 = vi[1];
8534c1414c8SBarry Smith         vi += 2;
8544c1414c8SBarry Smith         tmp0 = tmps[i0];
8554c1414c8SBarry Smith         tmp1 = tmps[i1];
8569371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8579371c9d4SSatish Balay         v1 += 2;
8589371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8599371c9d4SSatish Balay         v2 += 2;
8609371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
8619371c9d4SSatish Balay         v3 += 2;
8624c1414c8SBarry Smith       }
8634c1414c8SBarry Smith       if (j == nz - 1) {
8644c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8654c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8664c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
8674c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
8684c1414c8SBarry Smith       }
      /* remaining values of the later rows multiply the results of the earlier rows of the same node */
8694c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
8704c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
8714c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
8722205254eSKarl Rupp 
8734c1414c8SBarry Smith       tmp[row++] = sum1;
8744c1414c8SBarry Smith       tmp[row++] = sum2;
8754c1414c8SBarry Smith       tmp[row++] = sum3;
8764c1414c8SBarry Smith       break;
8774c1414c8SBarry Smith 
8784c1414c8SBarry Smith     case 4:
8794c1414c8SBarry Smith       sum1 = b[*r++];
8804c1414c8SBarry Smith       sum2 = b[*r++];
8814c1414c8SBarry Smith       sum3 = b[*r++];
8824c1414c8SBarry Smith       sum4 = b[*r++];
8834c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8844c1414c8SBarry Smith       v3   = aa + ai[row + 2];
8854c1414c8SBarry Smith       v4   = aa + ai[row + 3];
8864c1414c8SBarry Smith 
8874c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8884c1414c8SBarry Smith         i0 = vi[0];
8894c1414c8SBarry Smith         i1 = vi[1];
8904c1414c8SBarry Smith         vi += 2;
8914c1414c8SBarry Smith         tmp0 = tmps[i0];
8924c1414c8SBarry Smith         tmp1 = tmps[i1];
8939371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8949371c9d4SSatish Balay         v1 += 2;
8959371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8969371c9d4SSatish Balay         v2 += 2;
8979371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
8989371c9d4SSatish Balay         v3 += 2;
8999371c9d4SSatish Balay         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
9009371c9d4SSatish Balay         v4 += 2;
9014c1414c8SBarry Smith       }
9024c1414c8SBarry Smith       if (j == nz - 1) {
9034c1414c8SBarry Smith         tmp0 = tmps[*vi++];
9044c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
9054c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
9064c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
9074c1414c8SBarry Smith         sum4 -= *v4++ * tmp0;
9084c1414c8SBarry Smith       }
9094c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
9104c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
9114c1414c8SBarry Smith       sum4 -= *v4++ * sum1;
9124c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
9134c1414c8SBarry Smith       sum4 -= *v4++ * sum2;
9144c1414c8SBarry Smith       sum4 -= *v4++ * sum3;
9154c1414c8SBarry Smith 
9164c1414c8SBarry Smith       tmp[row++] = sum1;
9174c1414c8SBarry Smith       tmp[row++] = sum2;
9184c1414c8SBarry Smith       tmp[row++] = sum3;
9194c1414c8SBarry Smith       tmp[row++] = sum4;
9204c1414c8SBarry Smith       break;
9214c1414c8SBarry Smith     case 5:
9224c1414c8SBarry Smith       sum1 = b[*r++];
9234c1414c8SBarry Smith       sum2 = b[*r++];
9244c1414c8SBarry Smith       sum3 = b[*r++];
9254c1414c8SBarry Smith       sum4 = b[*r++];
9264c1414c8SBarry Smith       sum5 = b[*r++];
9274c1414c8SBarry Smith       v2   = aa + ai[row + 1];
9284c1414c8SBarry Smith       v3   = aa + ai[row + 2];
9294c1414c8SBarry Smith       v4   = aa + ai[row + 3];
9304c1414c8SBarry Smith       v5   = aa + ai[row + 4];
9314c1414c8SBarry Smith 
9324c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
9334c1414c8SBarry Smith         i0 = vi[0];
9344c1414c8SBarry Smith         i1 = vi[1];
9354c1414c8SBarry Smith         vi += 2;
9364c1414c8SBarry Smith         tmp0 = tmps[i0];
9374c1414c8SBarry Smith         tmp1 = tmps[i1];
9389371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
9399371c9d4SSatish Balay         v1 += 2;
9409371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
9419371c9d4SSatish Balay         v2 += 2;
9429371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
9439371c9d4SSatish Balay         v3 += 2;
9449371c9d4SSatish Balay         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
9459371c9d4SSatish Balay         v4 += 2;
9469371c9d4SSatish Balay         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
9479371c9d4SSatish Balay         v5 += 2;
9484c1414c8SBarry Smith       }
9494c1414c8SBarry Smith       if (j == nz - 1) {
9504c1414c8SBarry Smith         tmp0 = tmps[*vi++];
9514c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
9524c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
9534c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
9544c1414c8SBarry Smith         sum4 -= *v4++ * tmp0;
9554c1414c8SBarry Smith         sum5 -= *v5++ * tmp0;
9564c1414c8SBarry Smith       }
9574c1414c8SBarry Smith 
9584c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
9594c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
9604c1414c8SBarry Smith       sum4 -= *v4++ * sum1;
9614c1414c8SBarry Smith       sum5 -= *v5++ * sum1;
9624c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
9634c1414c8SBarry Smith       sum4 -= *v4++ * sum2;
9644c1414c8SBarry Smith       sum5 -= *v5++ * sum2;
9654c1414c8SBarry Smith       sum4 -= *v4++ * sum3;
9664c1414c8SBarry Smith       sum5 -= *v5++ * sum3;
9674c1414c8SBarry Smith       sum5 -= *v5++ * sum4;
9684c1414c8SBarry Smith 
9694c1414c8SBarry Smith       tmp[row++] = sum1;
9704c1414c8SBarry Smith       tmp[row++] = sum2;
9714c1414c8SBarry Smith       tmp[row++] = sum3;
9724c1414c8SBarry Smith       tmp[row++] = sum4;
9734c1414c8SBarry Smith       tmp[row++] = sum5;
9744c1414c8SBarry Smith       break;
975d71ae5a4SJacob Faibussowitsch     default:
976d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
9774c1414c8SBarry Smith     }
9784c1414c8SBarry Smith   }
9794c1414c8SBarry Smith   /* backward solve the upper triangular */
  /* nodes are visited last to first; row is the last row of the current node and counts down */
9804c1414c8SBarry Smith   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
9814c1414c8SBarry Smith     nsz = ns[i];
    /* aii indexes the last stored entry of this row; nz counts the entries stored after its diagonal position */
9824c1414c8SBarry Smith     aii = ai[row + 1] - 1;
9834c1414c8SBarry Smith     v1  = aa + aii;
9844c1414c8SBarry Smith     vi  = aj + aii;
9854c1414c8SBarry Smith     nz  = aii - ad[row];
9864c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
9874c1414c8SBarry Smith     case 1:
9884c1414c8SBarry Smith       sum1 = tmp[row];
9894c1414c8SBarry Smith 
      /* walk the row from its end toward the diagonal, two entries per pass */
9904c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
9914c1414c8SBarry Smith         vi -= 2;
9924c1414c8SBarry Smith         i0   = vi[2];
9934c1414c8SBarry Smith         i1   = vi[1];
9944c1414c8SBarry Smith         tmp0 = tmps[i0];
9954c1414c8SBarry Smith         tmp1 = tmps[i1];
9964c1414c8SBarry Smith         v1 -= 2;
9974c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
9984c1414c8SBarry Smith       }
9994c1414c8SBarry Smith       if (j == 1) {
10004c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10014c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10024c1414c8SBarry Smith       }
      /* multiply by the value stored at the diagonal position, keep it in tmp for earlier rows and store it in x */
10039371c9d4SSatish Balay       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10049371c9d4SSatish Balay       row--;
10054c1414c8SBarry Smith       break;
10064c1414c8SBarry Smith     case 2:
10074c1414c8SBarry Smith       sum1 = tmp[row];
10084c1414c8SBarry Smith       sum2 = tmp[row - 1];
      /* v2 points at the last stored entry of the previous row (row - 1) */
10094c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10104c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10114c1414c8SBarry Smith         vi -= 2;
10124c1414c8SBarry Smith         i0   = vi[2];
10134c1414c8SBarry Smith         i1   = vi[1];
10144c1414c8SBarry Smith         tmp0 = tmps[i0];
10154c1414c8SBarry Smith         tmp1 = tmps[i1];
10164c1414c8SBarry Smith         v1 -= 2;
10174c1414c8SBarry Smith         v2 -= 2;
10184c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10194c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10204c1414c8SBarry Smith       }
10214c1414c8SBarry Smith       if (j == 1) {
10224c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10234c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10244c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
10254c1414c8SBarry Smith       }
10264c1414c8SBarry Smith 
      /* finish the last row of the node first, then feed its value into the row above it */
10279371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10289371c9d4SSatish Balay       row--;
10294c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
10309371c9d4SSatish Balay       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
10319371c9d4SSatish Balay       row--;
10324c1414c8SBarry Smith       break;
10334c1414c8SBarry Smith     case 3:
10344c1414c8SBarry Smith       sum1 = tmp[row];
10354c1414c8SBarry Smith       sum2 = tmp[row - 1];
10364c1414c8SBarry Smith       sum3 = tmp[row - 2];
10374c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10384c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
10394c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10404c1414c8SBarry Smith         vi -= 2;
10414c1414c8SBarry Smith         i0   = vi[2];
10424c1414c8SBarry Smith         i1   = vi[1];
10434c1414c8SBarry Smith         tmp0 = tmps[i0];
10444c1414c8SBarry Smith         tmp1 = tmps[i1];
10454c1414c8SBarry Smith         v1 -= 2;
10464c1414c8SBarry Smith         v2 -= 2;
10474c1414c8SBarry Smith         v3 -= 2;
10484c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10494c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10504c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
10514c1414c8SBarry Smith       }
10524c1414c8SBarry Smith       if (j == 1) {
10534c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10544c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10554c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
10564c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
10574c1414c8SBarry Smith       }
10589371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10599371c9d4SSatish Balay       row--;
10604c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
10614c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
10629371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
10639371c9d4SSatish Balay       row--;
10644c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
10659371c9d4SSatish Balay       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
10669371c9d4SSatish Balay       row--;
10674c1414c8SBarry Smith 
10684c1414c8SBarry Smith       break;
10694c1414c8SBarry Smith     case 4:
10704c1414c8SBarry Smith       sum1 = tmp[row];
10714c1414c8SBarry Smith       sum2 = tmp[row - 1];
10724c1414c8SBarry Smith       sum3 = tmp[row - 2];
10734c1414c8SBarry Smith       sum4 = tmp[row - 3];
10744c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10754c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
10764c1414c8SBarry Smith       v4   = aa + ai[row - 2] - 1;
10774c1414c8SBarry Smith 
10784c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10794c1414c8SBarry Smith         vi -= 2;
10804c1414c8SBarry Smith         i0   = vi[2];
10814c1414c8SBarry Smith         i1   = vi[1];
10824c1414c8SBarry Smith         tmp0 = tmps[i0];
10834c1414c8SBarry Smith         tmp1 = tmps[i1];
10844c1414c8SBarry Smith         v1 -= 2;
10854c1414c8SBarry Smith         v2 -= 2;
10864c1414c8SBarry Smith         v3 -= 2;
10874c1414c8SBarry Smith         v4 -= 2;
10884c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10894c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10904c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
10914c1414c8SBarry Smith         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
10924c1414c8SBarry Smith       }
10934c1414c8SBarry Smith       if (j == 1) {
10944c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10954c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10964c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
10974c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
10984c1414c8SBarry Smith         sum4 -= *v4-- * tmp0;
10994c1414c8SBarry Smith       }
11004c1414c8SBarry Smith 
11019371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
11029371c9d4SSatish Balay       row--;
11034c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
11044c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11054c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11069371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
11079371c9d4SSatish Balay       row--;
11084c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11094c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11109371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
11119371c9d4SSatish Balay       row--;
11124c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11139371c9d4SSatish Balay       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
11149371c9d4SSatish Balay       row--;
11154c1414c8SBarry Smith       break;
11164c1414c8SBarry Smith     case 5:
11174c1414c8SBarry Smith       sum1 = tmp[row];
11184c1414c8SBarry Smith       sum2 = tmp[row - 1];
11194c1414c8SBarry Smith       sum3 = tmp[row - 2];
11204c1414c8SBarry Smith       sum4 = tmp[row - 3];
11214c1414c8SBarry Smith       sum5 = tmp[row - 4];
11224c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
11234c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
11244c1414c8SBarry Smith       v4   = aa + ai[row - 2] - 1;
11254c1414c8SBarry Smith       v5   = aa + ai[row - 3] - 1;
11264c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
11274c1414c8SBarry Smith         vi -= 2;
11284c1414c8SBarry Smith         i0   = vi[2];
11294c1414c8SBarry Smith         i1   = vi[1];
11304c1414c8SBarry Smith         tmp0 = tmps[i0];
11314c1414c8SBarry Smith         tmp1 = tmps[i1];
11324c1414c8SBarry Smith         v1 -= 2;
11334c1414c8SBarry Smith         v2 -= 2;
11344c1414c8SBarry Smith         v3 -= 2;
11354c1414c8SBarry Smith         v4 -= 2;
11364c1414c8SBarry Smith         v5 -= 2;
11374c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
11384c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
11394c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
11404c1414c8SBarry Smith         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
11414c1414c8SBarry Smith         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
11424c1414c8SBarry Smith       }
11434c1414c8SBarry Smith       if (j == 1) {
11444c1414c8SBarry Smith         tmp0 = tmps[*vi--];
11454c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
11464c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
11474c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
11484c1414c8SBarry Smith         sum4 -= *v4-- * tmp0;
11494c1414c8SBarry Smith         sum5 -= *v5-- * tmp0;
11504c1414c8SBarry Smith       }
11514c1414c8SBarry Smith 
11529371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
11539371c9d4SSatish Balay       row--;
11544c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
11554c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11564c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11574c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11589371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
11599371c9d4SSatish Balay       row--;
11604c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11614c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11624c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11639371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
11649371c9d4SSatish Balay       row--;
11654c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11664c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11679371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
11689371c9d4SSatish Balay       row--;
11694c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11709371c9d4SSatish Balay       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
11719371c9d4SSatish Balay       row--;
11724c1414c8SBarry Smith       break;
1173d71ae5a4SJacob Faibussowitsch     default:
1174d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
11754c1414c8SBarry Smith     }
11764c1414c8SBarry Smith   }
11779566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
11789566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
11799566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
11809566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(xx, &x));
11819566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
11823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
11834c1414c8SBarry Smith }
11844c1414c8SBarry Smith 
1185d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1186d71ae5a4SJacob Faibussowitsch {
118728f1b45aSHong Zhang   Mat              C = B;
118828f1b45aSHong Zhang   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
118928f1b45aSHong Zhang   IS               isrow = b->row, isicol = b->icol;
119028f1b45aSHong Zhang   const PetscInt  *r, *ic, *ics;
119128f1b45aSHong Zhang   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
119228f1b45aSHong Zhang   PetscInt         i, j, k, nz, nzL, row, *pj;
119328f1b45aSHong Zhang   const PetscInt  *ajtmp, *bjtmp;
11949877982aSShri Abhyankar   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
11959877982aSShri Abhyankar   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
119628f1b45aSHong Zhang   FactorShiftCtx   sctx;
11974f81c4b7SBarry Smith   const PetscInt  *ddiag;
119828f1b45aSHong Zhang   PetscReal        rs;
119928f1b45aSHong Zhang   MatScalar        d;
12004f81c4b7SBarry Smith   PetscInt         inod, nodesz, node_max, col;
12014f81c4b7SBarry Smith   const PetscInt  *ns;
120207b50cabSHong Zhang   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
12030e95ead3SHong Zhang 
120428f1b45aSHong Zhang   PetscFunctionBegin;
120528f1b45aSHong Zhang   /* MatPivotSetUp(): initialize shift context sctx */
12069566063dSJacob Faibussowitsch   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
120728f1b45aSHong Zhang 
1208f4db908eSBarry Smith   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
120928f1b45aSHong Zhang     ddiag          = a->diag;
121028f1b45aSHong Zhang     sctx.shift_top = info->zeropivot;
121128f1b45aSHong Zhang     for (i = 0; i < n; i++) {
121228f1b45aSHong Zhang       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
121328f1b45aSHong Zhang       d  = (aa)[ddiag[i]];
121428f1b45aSHong Zhang       rs = -PetscAbsScalar(d) - PetscRealPart(d);
121528f1b45aSHong Zhang       v  = aa + ai[i];
121628f1b45aSHong Zhang       nz = ai[i + 1] - ai[i];
12172205254eSKarl Rupp       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
121828f1b45aSHong Zhang       if (rs > sctx.shift_top) sctx.shift_top = rs;
121928f1b45aSHong Zhang     }
122028f1b45aSHong Zhang     sctx.shift_top *= 1.1;
122128f1b45aSHong Zhang     sctx.nshift_max = 5;
122228f1b45aSHong Zhang     sctx.shift_lo   = 0.;
122328f1b45aSHong Zhang     sctx.shift_hi   = 1.;
122428f1b45aSHong Zhang   }
122528f1b45aSHong Zhang 
12269566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
12279566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
122868785679SHong Zhang 
12299566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
123028f1b45aSHong Zhang   ics = ic;
123128f1b45aSHong Zhang 
123228f1b45aSHong Zhang   node_max = a->inode.node_count;
123328f1b45aSHong Zhang   ns       = a->inode.size;
123428b400f6SJacob Faibussowitsch   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
123528f1b45aSHong Zhang 
12369877982aSShri Abhyankar   /* If max inode size > 4, split it into two inodes.*/
123768785679SHong Zhang   /* also map the inode sizes according to the ordering */
12389566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
123968785679SHong Zhang   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1240b1550197SShri Abhyankar     if (ns[i] > 4) {
1241048b5e81SShri Abhyankar       tmp_vec1[j] = 4;
124268785679SHong Zhang       ++j;
124368785679SHong Zhang       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
124468785679SHong Zhang     } else {
124568785679SHong Zhang       tmp_vec1[j] = ns[i];
124668785679SHong Zhang     }
124768785679SHong Zhang   }
124868785679SHong Zhang   /* Use the correct node_max */
124968785679SHong Zhang   node_max = j;
125068785679SHong Zhang 
125168785679SHong Zhang   /* Now reorder the inode info based on mat re-ordering info */
125268785679SHong Zhang   /* First create a row -> inode_size_array_index map */
12539566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &nsmap));
12549566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
125568785679SHong Zhang   for (i = 0, row = 0; i < node_max; i++) {
125668785679SHong Zhang     nodesz = tmp_vec1[i];
1257ad540459SPierre Jolivet     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
125868785679SHong Zhang   }
125968785679SHong Zhang   /* Using nsmap, create a reordered ns structure */
126068785679SHong Zhang   for (i = 0, j = 0; i < node_max; i++) {
126168785679SHong Zhang     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
126268785679SHong Zhang     tmp_vec2[i] = nodesz;
126368785679SHong Zhang     j += nodesz;
126468785679SHong Zhang   }
12659566063dSJacob Faibussowitsch   PetscCall(PetscFree(nsmap));
12669566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec1));
1267b89f182dSHong Zhang 
126868785679SHong Zhang   /* Now use the correct ns */
126968785679SHong Zhang   ns = tmp_vec2;
127068785679SHong Zhang 
127128f1b45aSHong Zhang   do {
127207b50cabSHong Zhang     sctx.newshift = PETSC_FALSE;
127328f1b45aSHong Zhang     /* Now loop over each block-row, and do the factorization */
127428f1b45aSHong Zhang     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
127528f1b45aSHong Zhang       nodesz = ns[inod];
127628f1b45aSHong Zhang 
127728f1b45aSHong Zhang       switch (nodesz) {
127828f1b45aSHong Zhang       case 1:
1279b89f182dSHong Zhang         /* zero rtmp1 */
128028f1b45aSHong Zhang         /* L part */
128128f1b45aSHong Zhang         nz    = bi[i + 1] - bi[i];
128228f1b45aSHong Zhang         bjtmp = bj + bi[i];
1283b89f182dSHong Zhang         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
128428f1b45aSHong Zhang 
128528f1b45aSHong Zhang         /* U part */
128628f1b45aSHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
128728f1b45aSHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
1288b89f182dSHong Zhang         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
128928f1b45aSHong Zhang 
129028f1b45aSHong Zhang         /* load in initial (unfactored row) */
129128f1b45aSHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
129228f1b45aSHong Zhang         ajtmp = aj + ai[r[i]];
129328f1b45aSHong Zhang         v     = aa + ai[r[i]];
12942205254eSKarl Rupp         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
12952205254eSKarl Rupp 
129628f1b45aSHong Zhang         /* ZeropivotApply() */
1297b89f182dSHong Zhang         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
129828f1b45aSHong Zhang 
129928f1b45aSHong Zhang         /* elimination */
130028f1b45aSHong Zhang         bjtmp = bj + bi[i];
130128f1b45aSHong Zhang         row   = *bjtmp++;
130228f1b45aSHong Zhang         nzL   = bi[i + 1] - bi[i];
130328f1b45aSHong Zhang         for (k = 0; k < nzL; k++) {
1304b89f182dSHong Zhang           pc = rtmp1 + row;
130528f1b45aSHong Zhang           if (*pc != 0.0) {
130628f1b45aSHong Zhang             pv   = b->a + bdiag[row];
1307b89f182dSHong Zhang             mul1 = *pc * (*pv);
1308b89f182dSHong Zhang             *pc  = mul1;
130928f1b45aSHong Zhang             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
131028f1b45aSHong Zhang             pv   = b->a + bdiag[row + 1] + 1;
131128f1b45aSHong Zhang             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1312b89f182dSHong Zhang             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
13139566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1 + 2.0 * nz));
131428f1b45aSHong Zhang           }
131528f1b45aSHong Zhang           row = *bjtmp++;
131628f1b45aSHong Zhang         }
131728f1b45aSHong Zhang 
131828f1b45aSHong Zhang         /* finished row so stick it into b->a */
131928f1b45aSHong Zhang         rs = 0.0;
132028f1b45aSHong Zhang         /* L part */
132128f1b45aSHong Zhang         pv = b->a + bi[i];
132228f1b45aSHong Zhang         pj = b->j + bi[i];
132328f1b45aSHong Zhang         nz = bi[i + 1] - bi[i];
132428f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
13259371c9d4SSatish Balay           pv[j] = rtmp1[pj[j]];
13269371c9d4SSatish Balay           rs += PetscAbsScalar(pv[j]);
132728f1b45aSHong Zhang         }
132828f1b45aSHong Zhang 
132928f1b45aSHong Zhang         /* U part */
133028f1b45aSHong Zhang         pv = b->a + bdiag[i + 1] + 1;
133128f1b45aSHong Zhang         pj = b->j + bdiag[i + 1] + 1;
133228f1b45aSHong Zhang         nz = bdiag[i] - bdiag[i + 1] - 1;
133328f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
13349371c9d4SSatish Balay           pv[j] = rtmp1[pj[j]];
13359371c9d4SSatish Balay           rs += PetscAbsScalar(pv[j]);
133628f1b45aSHong Zhang         }
133728f1b45aSHong Zhang 
1338b89f182dSHong Zhang         /* Check zero pivot */
133928f1b45aSHong Zhang         sctx.rs = rs;
1340b89f182dSHong Zhang         sctx.pv = rtmp1[i];
13419566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
134207b50cabSHong Zhang         if (sctx.newshift) break;
134328f1b45aSHong Zhang 
1344a5b23f4aSJose E. Roman         /* Mark diagonal and invert diagonal for simpler triangular solves */
134528f1b45aSHong Zhang         pv  = b->a + bdiag[i];
1346b89f182dSHong Zhang         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
134728f1b45aSHong Zhang         break;
134828f1b45aSHong Zhang 
134928f1b45aSHong Zhang       case 2:
1350b89f182dSHong Zhang         /* zero rtmp1 and rtmp2 */
135128f1b45aSHong Zhang         /* L part */
135228f1b45aSHong Zhang         nz    = bi[i + 1] - bi[i];
135328f1b45aSHong Zhang         bjtmp = bj + bi[i];
135428f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
135568785679SHong Zhang           col        = bjtmp[j];
13569371c9d4SSatish Balay           rtmp1[col] = 0.0;
13579371c9d4SSatish Balay           rtmp2[col] = 0.0;
135828f1b45aSHong Zhang         }
135928f1b45aSHong Zhang 
136028f1b45aSHong Zhang         /* U part */
136128f1b45aSHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
136228f1b45aSHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
136328f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
136468785679SHong Zhang           col        = bjtmp[j];
13659371c9d4SSatish Balay           rtmp1[col] = 0.0;
13669371c9d4SSatish Balay           rtmp2[col] = 0.0;
136728f1b45aSHong Zhang         }
136828f1b45aSHong Zhang 
136928f1b45aSHong Zhang         /* load in initial (unfactored row) */
137028f1b45aSHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
137128f1b45aSHong Zhang         ajtmp = aj + ai[r[i]];
13729371c9d4SSatish Balay         v1    = aa + ai[r[i]];
13739371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
137428f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
137568785679SHong Zhang           col        = ics[ajtmp[j]];
13769371c9d4SSatish Balay           rtmp1[col] = v1[j];
13779371c9d4SSatish Balay           rtmp2[col] = v2[j];
137828f1b45aSHong Zhang         }
137928f1b45aSHong Zhang         /* ZeropivotApply(): shift the diagonal of the matrix  */
13809371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
13819371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
138228f1b45aSHong Zhang 
138328f1b45aSHong Zhang         /* elimination */
138428f1b45aSHong Zhang         bjtmp = bj + bi[i];
138528f1b45aSHong Zhang         row   = *bjtmp++; /* pivot row */
138628f1b45aSHong Zhang         nzL   = bi[i + 1] - bi[i];
138728f1b45aSHong Zhang         for (k = 0; k < nzL; k++) {
1388b89f182dSHong Zhang           pc1 = rtmp1 + row;
1389b89f182dSHong Zhang           pc2 = rtmp2 + row;
139028f1b45aSHong Zhang           if (*pc1 != 0.0 || *pc2 != 0.0) {
139128f1b45aSHong Zhang             pv   = b->a + bdiag[row];
13929371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
13939371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
13949371c9d4SSatish Balay             *pc1 = mul1;
13959371c9d4SSatish Balay             *pc2 = mul2;
139628f1b45aSHong Zhang 
139728f1b45aSHong Zhang             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
139828f1b45aSHong Zhang             pv = b->a + bdiag[row + 1] + 1;
139928f1b45aSHong Zhang             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
140028f1b45aSHong Zhang             for (j = 0; j < nz; j++) {
140168785679SHong Zhang               col = pj[j];
1402b89f182dSHong Zhang               rtmp1[col] -= mul1 * pv[j];
1403b89f182dSHong Zhang               rtmp2[col] -= mul2 * pv[j];
140428f1b45aSHong Zhang             }
14059566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(2 + 4.0 * nz));
140628f1b45aSHong Zhang           }
140728f1b45aSHong Zhang           row = *bjtmp++;
140828f1b45aSHong Zhang         }
140928f1b45aSHong Zhang 
1410b89f182dSHong Zhang         /* finished row i; check zero pivot, then stick row i into b->a */
141128f1b45aSHong Zhang         rs = 0.0;
141228f1b45aSHong Zhang         /* L part */
1413b89f182dSHong Zhang         pc1 = b->a + bi[i];
141428f1b45aSHong Zhang         pj  = b->j + bi[i];
141528f1b45aSHong Zhang         nz  = bi[i + 1] - bi[i];
141628f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
141768785679SHong Zhang           col    = pj[j];
14189371c9d4SSatish Balay           pc1[j] = rtmp1[col];
14199371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
142028f1b45aSHong Zhang         }
142128f1b45aSHong Zhang         /* U part */
1422b89f182dSHong Zhang         pc1 = b->a + bdiag[i + 1] + 1;
142328f1b45aSHong Zhang         pj  = b->j + bdiag[i + 1] + 1;
14240e7a5c2bSHong Zhang         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
142528f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
142668785679SHong Zhang           col    = pj[j];
14279371c9d4SSatish Balay           pc1[j] = rtmp1[col];
14289371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
142928f1b45aSHong Zhang         }
143028f1b45aSHong Zhang 
143128f1b45aSHong Zhang         sctx.rs = rs;
1432b89f182dSHong Zhang         sctx.pv = rtmp1[i];
14339566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
143407b50cabSHong Zhang         if (sctx.newshift) break;
1435b89f182dSHong Zhang         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1436b89f182dSHong Zhang         *pc1 = 1.0 / sctx.pv;
1437b89f182dSHong Zhang 
1438b89f182dSHong Zhang         /* Now take care of diagonal 2x2 block. */
1439b89f182dSHong Zhang         pc2 = rtmp2 + i;
1440b89f182dSHong Zhang         if (*pc2 != 0.0) {
1441b89f182dSHong Zhang           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1442b89f182dSHong Zhang           *pc2 = mul1;                        /* insert L entry */
1443b89f182dSHong Zhang           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1444b89f182dSHong Zhang           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1445b89f182dSHong Zhang           for (j = 0; j < nz; j++) {
14469371c9d4SSatish Balay             col = pj[j];
14479371c9d4SSatish Balay             rtmp2[col] -= mul1 * rtmp1[col];
144828f1b45aSHong Zhang           }
14499566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1450b89f182dSHong Zhang         }
1451b89f182dSHong Zhang 
1452b89f182dSHong Zhang         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1453b89f182dSHong Zhang         rs = 0.0;
1454b89f182dSHong Zhang         /* L part */
1455b89f182dSHong Zhang         pc2 = b->a + bi[i + 1];
1456b89f182dSHong Zhang         pj  = b->j + bi[i + 1];
1457b89f182dSHong Zhang         nz  = bi[i + 2] - bi[i + 1];
1458b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1459b89f182dSHong Zhang           col    = pj[j];
14609371c9d4SSatish Balay           pc2[j] = rtmp2[col];
14619371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1462b89f182dSHong Zhang         }
1463b89f182dSHong Zhang         /* U part */
1464b89f182dSHong Zhang         pc2 = b->a + bdiag[i + 2] + 1;
14650e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 2] + 1;
14660e7a5c2bSHong Zhang         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1467b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1468b89f182dSHong Zhang           col    = pj[j];
14699371c9d4SSatish Balay           pc2[j] = rtmp2[col];
14709371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1471b89f182dSHong Zhang         }
1472b89f182dSHong Zhang 
147328f1b45aSHong Zhang         sctx.rs = rs;
1474b89f182dSHong Zhang         sctx.pv = rtmp2[i + 1];
14759566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
147607b50cabSHong Zhang         if (sctx.newshift) break;
147728f1b45aSHong Zhang         pc2  = b->a + bdiag[i + 1];
1478b89f182dSHong Zhang         *pc2 = 1.0 / sctx.pv;
147928f1b45aSHong Zhang         break;
1480b89f182dSHong Zhang 
148168785679SHong Zhang       case 3:
148268785679SHong Zhang         /* zero rtmp */
148368785679SHong Zhang         /* L part */
148468785679SHong Zhang         nz    = bi[i + 1] - bi[i];
148568785679SHong Zhang         bjtmp = bj + bi[i];
148668785679SHong Zhang         for (j = 0; j < nz; j++) {
148768785679SHong Zhang           col        = bjtmp[j];
14889371c9d4SSatish Balay           rtmp1[col] = 0.0;
14899371c9d4SSatish Balay           rtmp2[col] = 0.0;
14909371c9d4SSatish Balay           rtmp3[col] = 0.0;
149168785679SHong Zhang         }
149268785679SHong Zhang 
149368785679SHong Zhang         /* U part */
149468785679SHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
149568785679SHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
149668785679SHong Zhang         for (j = 0; j < nz; j++) {
149768785679SHong Zhang           col        = bjtmp[j];
14989371c9d4SSatish Balay           rtmp1[col] = 0.0;
14999371c9d4SSatish Balay           rtmp2[col] = 0.0;
15009371c9d4SSatish Balay           rtmp3[col] = 0.0;
150168785679SHong Zhang         }
150268785679SHong Zhang 
150368785679SHong Zhang         /* load in initial (unfactored row) */
150468785679SHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
150568785679SHong Zhang         ajtmp = aj + ai[r[i]];
15069371c9d4SSatish Balay         v1    = aa + ai[r[i]];
15079371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
15089371c9d4SSatish Balay         v3    = aa + ai[r[i] + 2];
150968785679SHong Zhang         for (j = 0; j < nz; j++) {
151068785679SHong Zhang           col        = ics[ajtmp[j]];
15119371c9d4SSatish Balay           rtmp1[col] = v1[j];
15129371c9d4SSatish Balay           rtmp2[col] = v2[j];
15139371c9d4SSatish Balay           rtmp3[col] = v3[j];
151468785679SHong Zhang         }
151568785679SHong Zhang         /* ZeropivotApply(): shift the diagonal of the matrix  */
15169371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
15179371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
15189371c9d4SSatish Balay         rtmp3[i + 2] += sctx.shift_amount;
151968785679SHong Zhang 
152068785679SHong Zhang         /* elimination */
152168785679SHong Zhang         bjtmp = bj + bi[i];
152268785679SHong Zhang         row   = *bjtmp++; /* pivot row */
152368785679SHong Zhang         nzL   = bi[i + 1] - bi[i];
152468785679SHong Zhang         for (k = 0; k < nzL; k++) {
1525b89f182dSHong Zhang           pc1 = rtmp1 + row;
1526b89f182dSHong Zhang           pc2 = rtmp2 + row;
1527b89f182dSHong Zhang           pc3 = rtmp3 + row;
152868785679SHong Zhang           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
152968785679SHong Zhang             pv   = b->a + bdiag[row];
15309371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
15319371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
15329371c9d4SSatish Balay             mul3 = *pc3 * (*pv);
15339371c9d4SSatish Balay             *pc1 = mul1;
15349371c9d4SSatish Balay             *pc2 = mul2;
15359371c9d4SSatish Balay             *pc3 = mul3;
153668785679SHong Zhang 
153768785679SHong Zhang             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
153868785679SHong Zhang             pv = b->a + bdiag[row + 1] + 1;
153968785679SHong Zhang             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
154068785679SHong Zhang             for (j = 0; j < nz; j++) {
154168785679SHong Zhang               col = pj[j];
1542b89f182dSHong Zhang               rtmp1[col] -= mul1 * pv[j];
1543b89f182dSHong Zhang               rtmp2[col] -= mul2 * pv[j];
1544b89f182dSHong Zhang               rtmp3[col] -= mul3 * pv[j];
154568785679SHong Zhang             }
15469566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(3 + 6.0 * nz));
154768785679SHong Zhang           }
154868785679SHong Zhang           row = *bjtmp++;
154968785679SHong Zhang         }
155068785679SHong Zhang 
1551b89f182dSHong Zhang         /* finished row i; check zero pivot, then stick row i into b->a */
1552b89f182dSHong Zhang         rs = 0.0;
1553b89f182dSHong Zhang         /* L part */
1554b89f182dSHong Zhang         pc1 = b->a + bi[i];
1555b89f182dSHong Zhang         pj  = b->j + bi[i];
1556b89f182dSHong Zhang         nz  = bi[i + 1] - bi[i];
1557b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1558b89f182dSHong Zhang           col    = pj[j];
15599371c9d4SSatish Balay           pc1[j] = rtmp1[col];
15609371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
1561b89f182dSHong Zhang         }
1562b89f182dSHong Zhang         /* U part */
1563b89f182dSHong Zhang         pc1 = b->a + bdiag[i + 1] + 1;
1564b89f182dSHong Zhang         pj  = b->j + bdiag[i + 1] + 1;
15650e7a5c2bSHong Zhang         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1566b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1567b89f182dSHong Zhang           col    = pj[j];
15689371c9d4SSatish Balay           pc1[j] = rtmp1[col];
15699371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
1570b89f182dSHong Zhang         }
157168785679SHong Zhang 
1572b89f182dSHong Zhang         sctx.rs = rs;
1573b89f182dSHong Zhang         sctx.pv = rtmp1[i];
15749566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
157507b50cabSHong Zhang         if (sctx.newshift) break;
1576b89f182dSHong Zhang         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1577b89f182dSHong Zhang         *pc1 = 1.0 / sctx.pv;
1578b89f182dSHong Zhang 
1579b89f182dSHong Zhang         /* Now take care of 1st column of diagonal 3x3 block. */
1580b89f182dSHong Zhang         pc2 = rtmp2 + i;
1581b89f182dSHong Zhang         pc3 = rtmp3 + i;
1582b89f182dSHong Zhang         if (*pc2 != 0.0 || *pc3 != 0.0) {
15839371c9d4SSatish Balay           mul2 = (*pc2) * (*pc1);
15849371c9d4SSatish Balay           *pc2 = mul2;
15859371c9d4SSatish Balay           mul3 = (*pc3) * (*pc1);
15869371c9d4SSatish Balay           *pc3 = mul3;
158768785679SHong Zhang           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
158868785679SHong Zhang           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
158968785679SHong Zhang           for (j = 0; j < nz; j++) {
159068785679SHong Zhang             col = pj[j];
1591b89f182dSHong Zhang             rtmp2[col] -= mul2 * rtmp1[col];
1592b89f182dSHong Zhang             rtmp3[col] -= mul3 * rtmp1[col];
159368785679SHong Zhang           }
15949566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(2 + 4.0 * nz));
159568785679SHong Zhang         }
159668785679SHong Zhang 
1597b89f182dSHong Zhang         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1598b89f182dSHong Zhang         rs = 0.0;
1599b89f182dSHong Zhang         /* L part */
1600b89f182dSHong Zhang         pc2 = b->a + bi[i + 1];
1601b89f182dSHong Zhang         pj  = b->j + bi[i + 1];
1602b89f182dSHong Zhang         nz  = bi[i + 2] - bi[i + 1];
1603b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1604b89f182dSHong Zhang           col    = pj[j];
16059371c9d4SSatish Balay           pc2[j] = rtmp2[col];
16069371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1607b89f182dSHong Zhang         }
1608b89f182dSHong Zhang         /* U part */
1609b89f182dSHong Zhang         pc2 = b->a + bdiag[i + 2] + 1;
16100e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 2] + 1;
16110e7a5c2bSHong Zhang         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1612b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1613b89f182dSHong Zhang           col    = pj[j];
16149371c9d4SSatish Balay           pc2[j] = rtmp2[col];
16159371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1616b89f182dSHong Zhang         }
1617b89f182dSHong Zhang 
1618b89f182dSHong Zhang         sctx.rs = rs;
1619b89f182dSHong Zhang         sctx.pv = rtmp2[i + 1];
16209566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
162107b50cabSHong Zhang         if (sctx.newshift) break;
1622b89f182dSHong Zhang         pc2  = b->a + bdiag[i + 1];
1623b89f182dSHong Zhang         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1624b89f182dSHong Zhang 
1625b89f182dSHong Zhang         /* Now take care of 2nd column of diagonal 3x3 block. */
1626b89f182dSHong Zhang         pc3 = rtmp3 + i + 1;
162768785679SHong Zhang         if (*pc3 != 0.0) {
16289371c9d4SSatish Balay           mul3 = (*pc3) * (*pc2);
16299371c9d4SSatish Balay           *pc3 = mul3;
163068785679SHong Zhang           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
163168785679SHong Zhang           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
163268785679SHong Zhang           for (j = 0; j < nz; j++) {
163368785679SHong Zhang             col = pj[j];
1634b89f182dSHong Zhang             rtmp3[col] -= mul3 * rtmp2[col];
163568785679SHong Zhang           }
16369566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
163768785679SHong Zhang         }
163868785679SHong Zhang 
1639b89f182dSHong Zhang         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
164068785679SHong Zhang         rs = 0.0;
164168785679SHong Zhang         /* L part */
1642b89f182dSHong Zhang         pc3 = b->a + bi[i + 2];
1643b89f182dSHong Zhang         pj  = b->j + bi[i + 2];
1644b89f182dSHong Zhang         nz  = bi[i + 3] - bi[i + 2];
164568785679SHong Zhang         for (j = 0; j < nz; j++) {
164668785679SHong Zhang           col    = pj[j];
16479371c9d4SSatish Balay           pc3[j] = rtmp3[col];
16489371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
164968785679SHong Zhang         }
165068785679SHong Zhang         /* U part */
1651b89f182dSHong Zhang         pc3 = b->a + bdiag[i + 3] + 1;
16520e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 3] + 1;
16530e7a5c2bSHong Zhang         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
165468785679SHong Zhang         for (j = 0; j < nz; j++) {
165568785679SHong Zhang           col    = pj[j];
16569371c9d4SSatish Balay           pc3[j] = rtmp3[col];
16579371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
165868785679SHong Zhang         }
165968785679SHong Zhang 
166068785679SHong Zhang         sctx.rs = rs;
1661b89f182dSHong Zhang         sctx.pv = rtmp3[i + 2];
16629566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
166307b50cabSHong Zhang         if (sctx.newshift) break;
166468785679SHong Zhang         pc3  = b->a + bdiag[i + 2];
1665b89f182dSHong Zhang         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
166668785679SHong Zhang         break;
16679877982aSShri Abhyankar       case 4:
16689877982aSShri Abhyankar         /* zero rtmp */
16699877982aSShri Abhyankar         /* L part */
16709877982aSShri Abhyankar         nz    = bi[i + 1] - bi[i];
16719877982aSShri Abhyankar         bjtmp = bj + bi[i];
16729877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
16739877982aSShri Abhyankar           col        = bjtmp[j];
16749371c9d4SSatish Balay           rtmp1[col] = 0.0;
16759371c9d4SSatish Balay           rtmp2[col] = 0.0;
16769371c9d4SSatish Balay           rtmp3[col] = 0.0;
16779371c9d4SSatish Balay           rtmp4[col] = 0.0;
16789877982aSShri Abhyankar         }
16799877982aSShri Abhyankar 
16809877982aSShri Abhyankar         /* U part */
16819877982aSShri Abhyankar         nz    = bdiag[i] - bdiag[i + 1];
16829877982aSShri Abhyankar         bjtmp = bj + bdiag[i + 1] + 1;
16839877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
16849877982aSShri Abhyankar           col        = bjtmp[j];
16859371c9d4SSatish Balay           rtmp1[col] = 0.0;
16869371c9d4SSatish Balay           rtmp2[col] = 0.0;
16879371c9d4SSatish Balay           rtmp3[col] = 0.0;
16889371c9d4SSatish Balay           rtmp4[col] = 0.0;
16899877982aSShri Abhyankar         }
16909877982aSShri Abhyankar 
16919877982aSShri Abhyankar         /* load in initial (unfactored row) */
16929877982aSShri Abhyankar         nz    = ai[r[i] + 1] - ai[r[i]];
16939877982aSShri Abhyankar         ajtmp = aj + ai[r[i]];
16949371c9d4SSatish Balay         v1    = aa + ai[r[i]];
16959371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
16969371c9d4SSatish Balay         v3    = aa + ai[r[i] + 2];
16979371c9d4SSatish Balay         v4    = aa + ai[r[i] + 3];
16989877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
16999877982aSShri Abhyankar           col        = ics[ajtmp[j]];
17009371c9d4SSatish Balay           rtmp1[col] = v1[j];
17019371c9d4SSatish Balay           rtmp2[col] = v2[j];
17029371c9d4SSatish Balay           rtmp3[col] = v3[j];
17039371c9d4SSatish Balay           rtmp4[col] = v4[j];
17049877982aSShri Abhyankar         }
17059877982aSShri Abhyankar         /* ZeropivotApply(): shift the diagonal of the matrix  */
17069371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
17079371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
17089371c9d4SSatish Balay         rtmp3[i + 2] += sctx.shift_amount;
17099371c9d4SSatish Balay         rtmp4[i + 3] += sctx.shift_amount;
17109877982aSShri Abhyankar 
17119877982aSShri Abhyankar         /* elimination */
17129877982aSShri Abhyankar         bjtmp = bj + bi[i];
17139877982aSShri Abhyankar         row   = *bjtmp++; /* pivot row */
17149877982aSShri Abhyankar         nzL   = bi[i + 1] - bi[i];
17159877982aSShri Abhyankar         for (k = 0; k < nzL; k++) {
17169877982aSShri Abhyankar           pc1 = rtmp1 + row;
17179877982aSShri Abhyankar           pc2 = rtmp2 + row;
17189877982aSShri Abhyankar           pc3 = rtmp3 + row;
17199877982aSShri Abhyankar           pc4 = rtmp4 + row;
17209877982aSShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
17219877982aSShri Abhyankar             pv   = b->a + bdiag[row];
17229371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
17239371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
17249371c9d4SSatish Balay             mul3 = *pc3 * (*pv);
17259371c9d4SSatish Balay             mul4 = *pc4 * (*pv);
17269371c9d4SSatish Balay             *pc1 = mul1;
17279371c9d4SSatish Balay             *pc2 = mul2;
17289371c9d4SSatish Balay             *pc3 = mul3;
17299371c9d4SSatish Balay             *pc4 = mul4;
17309877982aSShri Abhyankar 
17319877982aSShri Abhyankar             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
17329877982aSShri Abhyankar             pv = b->a + bdiag[row + 1] + 1;
17339877982aSShri Abhyankar             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
17349877982aSShri Abhyankar             for (j = 0; j < nz; j++) {
17359877982aSShri Abhyankar               col = pj[j];
17369877982aSShri Abhyankar               rtmp1[col] -= mul1 * pv[j];
17379877982aSShri Abhyankar               rtmp2[col] -= mul2 * pv[j];
17389877982aSShri Abhyankar               rtmp3[col] -= mul3 * pv[j];
17399877982aSShri Abhyankar               rtmp4[col] -= mul4 * pv[j];
17409877982aSShri Abhyankar             }
17419566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(4 + 8.0 * nz));
17429877982aSShri Abhyankar           }
17439877982aSShri Abhyankar           row = *bjtmp++;
17449877982aSShri Abhyankar         }
17459877982aSShri Abhyankar 
17469877982aSShri Abhyankar         /* finished row i; check zero pivot, then stick row i into b->a */
17479877982aSShri Abhyankar         rs = 0.0;
17489877982aSShri Abhyankar         /* L part */
17499877982aSShri Abhyankar         pc1 = b->a + bi[i];
17509877982aSShri Abhyankar         pj  = b->j + bi[i];
17519877982aSShri Abhyankar         nz  = bi[i + 1] - bi[i];
17529877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17539877982aSShri Abhyankar           col    = pj[j];
17549371c9d4SSatish Balay           pc1[j] = rtmp1[col];
17559371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
17569877982aSShri Abhyankar         }
17579877982aSShri Abhyankar         /* U part */
17589877982aSShri Abhyankar         pc1 = b->a + bdiag[i + 1] + 1;
17599877982aSShri Abhyankar         pj  = b->j + bdiag[i + 1] + 1;
17609877982aSShri Abhyankar         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
17619877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17629877982aSShri Abhyankar           col    = pj[j];
17639371c9d4SSatish Balay           pc1[j] = rtmp1[col];
17649371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
17659877982aSShri Abhyankar         }
17669877982aSShri Abhyankar 
17679877982aSShri Abhyankar         sctx.rs = rs;
17689877982aSShri Abhyankar         sctx.pv = rtmp1[i];
17699566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
177007b50cabSHong Zhang         if (sctx.newshift) break;
17719877982aSShri Abhyankar         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
17729877982aSShri Abhyankar         *pc1 = 1.0 / sctx.pv;
17739877982aSShri Abhyankar 
17749877982aSShri Abhyankar         /* Now take care of 1st column of diagonal 4x4 block. */
17759877982aSShri Abhyankar         pc2 = rtmp2 + i;
17769877982aSShri Abhyankar         pc3 = rtmp3 + i;
17779877982aSShri Abhyankar         pc4 = rtmp4 + i;
17789877982aSShri Abhyankar         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
17799371c9d4SSatish Balay           mul2 = (*pc2) * (*pc1);
17809371c9d4SSatish Balay           *pc2 = mul2;
17819371c9d4SSatish Balay           mul3 = (*pc3) * (*pc1);
17829371c9d4SSatish Balay           *pc3 = mul3;
17839371c9d4SSatish Balay           mul4 = (*pc4) * (*pc1);
17849371c9d4SSatish Balay           *pc4 = mul4;
17859877982aSShri Abhyankar           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
17869877982aSShri Abhyankar           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
17879877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
17889877982aSShri Abhyankar             col = pj[j];
17899877982aSShri Abhyankar             rtmp2[col] -= mul2 * rtmp1[col];
17909877982aSShri Abhyankar             rtmp3[col] -= mul3 * rtmp1[col];
17919877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp1[col];
17929877982aSShri Abhyankar           }
17939566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(3 + 6.0 * nz));
17949877982aSShri Abhyankar         }
17959877982aSShri Abhyankar 
17969877982aSShri Abhyankar         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
17979877982aSShri Abhyankar         rs = 0.0;
17989877982aSShri Abhyankar         /* L part */
17999877982aSShri Abhyankar         pc2 = b->a + bi[i + 1];
18009877982aSShri Abhyankar         pj  = b->j + bi[i + 1];
18019877982aSShri Abhyankar         nz  = bi[i + 2] - bi[i + 1];
18029877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18039877982aSShri Abhyankar           col    = pj[j];
18049371c9d4SSatish Balay           pc2[j] = rtmp2[col];
18059371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
18069877982aSShri Abhyankar         }
18079877982aSShri Abhyankar         /* U part */
18089877982aSShri Abhyankar         pc2 = b->a + bdiag[i + 2] + 1;
18099877982aSShri Abhyankar         pj  = b->j + bdiag[i + 2] + 1;
18109877982aSShri Abhyankar         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
18119877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18129877982aSShri Abhyankar           col    = pj[j];
18139371c9d4SSatish Balay           pc2[j] = rtmp2[col];
18149371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
18159877982aSShri Abhyankar         }
18169877982aSShri Abhyankar 
18179877982aSShri Abhyankar         sctx.rs = rs;
18189877982aSShri Abhyankar         sctx.pv = rtmp2[i + 1];
18199566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
182007b50cabSHong Zhang         if (sctx.newshift) break;
18219877982aSShri Abhyankar         pc2  = b->a + bdiag[i + 1];
18229877982aSShri Abhyankar         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
18239877982aSShri Abhyankar 
18249877982aSShri Abhyankar         /* Now take care of 2nd column of diagonal 4x4 block. */
18259877982aSShri Abhyankar         pc3 = rtmp3 + i + 1;
18269877982aSShri Abhyankar         pc4 = rtmp4 + i + 1;
18279877982aSShri Abhyankar         if (*pc3 != 0.0 || *pc4 != 0.0) {
18289371c9d4SSatish Balay           mul3 = (*pc3) * (*pc2);
18299371c9d4SSatish Balay           *pc3 = mul3;
18309371c9d4SSatish Balay           mul4 = (*pc4) * (*pc2);
18319371c9d4SSatish Balay           *pc4 = mul4;
18329877982aSShri Abhyankar           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
18339877982aSShri Abhyankar           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
18349877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
18359877982aSShri Abhyankar             col = pj[j];
18369877982aSShri Abhyankar             rtmp3[col] -= mul3 * rtmp2[col];
18379877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp2[col];
18389877982aSShri Abhyankar           }
18399566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(4.0 * nz));
18409877982aSShri Abhyankar         }
18419877982aSShri Abhyankar 
18429877982aSShri Abhyankar         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
18439877982aSShri Abhyankar         rs = 0.0;
18449877982aSShri Abhyankar         /* L part */
18459877982aSShri Abhyankar         pc3 = b->a + bi[i + 2];
18469877982aSShri Abhyankar         pj  = b->j + bi[i + 2];
18479877982aSShri Abhyankar         nz  = bi[i + 3] - bi[i + 2];
18489877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18499877982aSShri Abhyankar           col    = pj[j];
18509371c9d4SSatish Balay           pc3[j] = rtmp3[col];
18519371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
18529877982aSShri Abhyankar         }
18539877982aSShri Abhyankar         /* U part */
18549877982aSShri Abhyankar         pc3 = b->a + bdiag[i + 3] + 1;
18559877982aSShri Abhyankar         pj  = b->j + bdiag[i + 3] + 1;
18569877982aSShri Abhyankar         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
18579877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18589877982aSShri Abhyankar           col    = pj[j];
18599371c9d4SSatish Balay           pc3[j] = rtmp3[col];
18609371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
18619877982aSShri Abhyankar         }
18629877982aSShri Abhyankar 
18639877982aSShri Abhyankar         sctx.rs = rs;
18649877982aSShri Abhyankar         sctx.pv = rtmp3[i + 2];
18659566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
186607b50cabSHong Zhang         if (sctx.newshift) break;
18679877982aSShri Abhyankar         pc3  = b->a + bdiag[i + 2];
18689877982aSShri Abhyankar         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
18699877982aSShri Abhyankar 
18709877982aSShri Abhyankar         /* Now take care of 3rd column of diagonal 4x4 block. */
18719877982aSShri Abhyankar         pc4 = rtmp4 + i + 2;
18729877982aSShri Abhyankar         if (*pc4 != 0.0) {
18739371c9d4SSatish Balay           mul4 = (*pc4) * (*pc3);
18749371c9d4SSatish Balay           *pc4 = mul4;
18759877982aSShri Abhyankar           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
18769877982aSShri Abhyankar           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
18779877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
18789877982aSShri Abhyankar             col = pj[j];
18799877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp3[col];
18809877982aSShri Abhyankar           }
18819566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
18829877982aSShri Abhyankar         }
18839877982aSShri Abhyankar 
18849877982aSShri Abhyankar         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
18859877982aSShri Abhyankar         rs = 0.0;
18869877982aSShri Abhyankar         /* L part */
18879877982aSShri Abhyankar         pc4 = b->a + bi[i + 3];
18889877982aSShri Abhyankar         pj  = b->j + bi[i + 3];
18899877982aSShri Abhyankar         nz  = bi[i + 4] - bi[i + 3];
18909877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18919877982aSShri Abhyankar           col    = pj[j];
18929371c9d4SSatish Balay           pc4[j] = rtmp4[col];
18939371c9d4SSatish Balay           rs += PetscAbsScalar(pc4[j]);
18949877982aSShri Abhyankar         }
18959877982aSShri Abhyankar         /* U part */
18969877982aSShri Abhyankar         pc4 = b->a + bdiag[i + 4] + 1;
18979877982aSShri Abhyankar         pj  = b->j + bdiag[i + 4] + 1;
18989877982aSShri Abhyankar         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
18999877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
19009877982aSShri Abhyankar           col    = pj[j];
19019371c9d4SSatish Balay           pc4[j] = rtmp4[col];
19029371c9d4SSatish Balay           rs += PetscAbsScalar(pc4[j]);
19039877982aSShri Abhyankar         }
19049877982aSShri Abhyankar 
19059877982aSShri Abhyankar         sctx.rs = rs;
19069877982aSShri Abhyankar         sctx.pv = rtmp4[i + 3];
19079566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
190807b50cabSHong Zhang         if (sctx.newshift) break;
19099877982aSShri Abhyankar         pc4  = b->a + bdiag[i + 3];
19109877982aSShri Abhyankar         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
19119877982aSShri Abhyankar         break;
191268785679SHong Zhang 
1913d71ae5a4SJacob Faibussowitsch       default:
1914d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
191528f1b45aSHong Zhang       }
1916c2b86aeeSHong Zhang       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
191728f1b45aSHong Zhang       i += nodesz;              /* Update the row */
191868785679SHong Zhang     }
191928f1b45aSHong Zhang 
192028f1b45aSHong Zhang     /* MatPivotRefine() */
192107b50cabSHong Zhang     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
192228f1b45aSHong Zhang       /*
192328f1b45aSHong Zhang        * if no shift in this attempt & shifting & started shifting & can refine,
192428f1b45aSHong Zhang        * then try lower shift
192528f1b45aSHong Zhang        */
192628f1b45aSHong Zhang       sctx.shift_hi       = sctx.shift_fraction;
192728f1b45aSHong Zhang       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
192828f1b45aSHong Zhang       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
192907b50cabSHong Zhang       sctx.newshift       = PETSC_TRUE;
193028f1b45aSHong Zhang       sctx.nshift++;
193128f1b45aSHong Zhang     }
193207b50cabSHong Zhang   } while (sctx.newshift);
193328f1b45aSHong Zhang 
19349566063dSJacob Faibussowitsch   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
19359566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec2));
19369566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
19379566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
193828f1b45aSHong Zhang 
1939abb87a52SBarry Smith   if (b->inode.size) {
1940abb87a52SBarry Smith     C->ops->solve = MatSolve_SeqAIJ_Inode;
1941abb87a52SBarry Smith   } else {
1942d3ac4fa3SBarry Smith     C->ops->solve = MatSolve_SeqAIJ;
1943abb87a52SBarry Smith   }
194428f1b45aSHong Zhang   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
194528f1b45aSHong Zhang   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
194628f1b45aSHong Zhang   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
194728f1b45aSHong Zhang   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1948a3d9026eSPierre Jolivet   C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
194928f1b45aSHong Zhang   C->assembled              = PETSC_TRUE;
195028f1b45aSHong Zhang   C->preallocated           = PETSC_TRUE;
19512205254eSKarl Rupp 
19529566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(C->cmap->n));
195328f1b45aSHong Zhang 
195428f1b45aSHong Zhang   /* MatShiftView(A,info,&sctx) */
195528f1b45aSHong Zhang   if (sctx.nshift) {
1956f4db908eSBarry Smith     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
19579566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1958f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
19599566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1960f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
19619566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
196228f1b45aSHong Zhang     }
196328f1b45aSHong Zhang   }
19643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
196528f1b45aSHong Zhang }
1966628f99d7SShri Abhyankar 
1967ff6a9541SJacob Faibussowitsch #if 0
1968ff6a9541SJacob Faibussowitsch // unused
1969ff6a9541SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1970d71ae5a4SJacob Faibussowitsch {
1971628f99d7SShri Abhyankar   Mat              C = B;
1972628f99d7SShri Abhyankar   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1973628f99d7SShri Abhyankar   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1974628f99d7SShri Abhyankar   const PetscInt  *r, *ic, *c, *ics;
1975628f99d7SShri Abhyankar   PetscInt         n = A->rmap->n, *bi = b->i;
1976628f99d7SShri Abhyankar   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
19778758e1faSBarry Smith   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
19788758e1faSBarry Smith   PetscInt        *ai = a->i, *aj = a->j;
1979628f99d7SShri Abhyankar   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1980628f99d7SShri Abhyankar   PetscScalar      mul1, mul2, mul3, tmp;
1981628f99d7SShri Abhyankar   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1982628f99d7SShri Abhyankar   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1983628f99d7SShri Abhyankar   PetscReal        rs = 0.0;
1984628f99d7SShri Abhyankar   FactorShiftCtx   sctx;
1985628f99d7SShri Abhyankar 
1986628f99d7SShri Abhyankar   PetscFunctionBegin;
1987628f99d7SShri Abhyankar   sctx.shift_top      = 0;
1988628f99d7SShri Abhyankar   sctx.nshift_max     = 0;
1989628f99d7SShri Abhyankar   sctx.shift_lo       = 0;
1990628f99d7SShri Abhyankar   sctx.shift_hi       = 0;
1991628f99d7SShri Abhyankar   sctx.shift_fraction = 0;
1992628f99d7SShri Abhyankar 
1993628f99d7SShri Abhyankar   /* if both shift schemes are chosen by user, only use info->shiftpd */
1994f4db908eSBarry Smith   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1995628f99d7SShri Abhyankar     sctx.shift_top = 0;
1996628f99d7SShri Abhyankar     for (i = 0; i < n; i++) {
1997628f99d7SShri Abhyankar       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1998628f99d7SShri Abhyankar       rs    = 0.0;
1999628f99d7SShri Abhyankar       ajtmp = aj + ai[i];
2000628f99d7SShri Abhyankar       rtmp1 = aa + ai[i];
2001628f99d7SShri Abhyankar       nz    = ai[i + 1] - ai[i];
2002628f99d7SShri Abhyankar       for (j = 0; j < nz; j++) {
2003628f99d7SShri Abhyankar         if (*ajtmp != i) {
2004628f99d7SShri Abhyankar           rs += PetscAbsScalar(*rtmp1++);
2005628f99d7SShri Abhyankar         } else {
2006628f99d7SShri Abhyankar           rs -= PetscRealPart(*rtmp1++);
2007628f99d7SShri Abhyankar         }
2008628f99d7SShri Abhyankar         ajtmp++;
2009628f99d7SShri Abhyankar       }
2010628f99d7SShri Abhyankar       if (rs > sctx.shift_top) sctx.shift_top = rs;
2011628f99d7SShri Abhyankar     }
2012628f99d7SShri Abhyankar     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2013628f99d7SShri Abhyankar     sctx.shift_top *= 1.1;
2014628f99d7SShri Abhyankar     sctx.nshift_max = 5;
2015628f99d7SShri Abhyankar     sctx.shift_lo   = 0.;
2016628f99d7SShri Abhyankar     sctx.shift_hi   = 1.;
2017628f99d7SShri Abhyankar   }
2018628f99d7SShri Abhyankar   sctx.shift_amount = 0;
2019628f99d7SShri Abhyankar   sctx.nshift       = 0;
2020628f99d7SShri Abhyankar 
20219566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
20229566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &c));
20239566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
20249566063dSJacob Faibussowitsch   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2025628f99d7SShri Abhyankar   ics = ic;
2026628f99d7SShri Abhyankar 
2027628f99d7SShri Abhyankar   node_max = a->inode.node_count;
2028628f99d7SShri Abhyankar   ns       = a->inode.size;
202928b400f6SJacob Faibussowitsch   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2030628f99d7SShri Abhyankar 
2031628f99d7SShri Abhyankar   /* If max inode size > 3, split it into two inodes.*/
2032628f99d7SShri Abhyankar   /* also map the inode sizes according to the ordering */
20339566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2034628f99d7SShri Abhyankar   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2035628f99d7SShri Abhyankar     if (ns[i] > 3) {
2036628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2037628f99d7SShri Abhyankar       ++j;
2038628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2039628f99d7SShri Abhyankar     } else {
2040628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i];
2041628f99d7SShri Abhyankar     }
2042628f99d7SShri Abhyankar   }
2043628f99d7SShri Abhyankar   /* Use the correct node_max */
2044628f99d7SShri Abhyankar   node_max = j;
2045628f99d7SShri Abhyankar 
2046628f99d7SShri Abhyankar   /* Now reorder the inode info based on mat re-ordering info */
2047628f99d7SShri Abhyankar   /* First create a row -> inode_size_array_index map */
20489566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2049628f99d7SShri Abhyankar   for (i = 0, row = 0; i < node_max; i++) {
2050628f99d7SShri Abhyankar     nodesz = tmp_vec1[i];
2051ad540459SPierre Jolivet     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2052628f99d7SShri Abhyankar   }
2053628f99d7SShri Abhyankar   /* Using nsmap, create a reordered ns structure */
2054628f99d7SShri Abhyankar   for (i = 0, j = 0; i < node_max; i++) {
2055628f99d7SShri Abhyankar     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2056628f99d7SShri Abhyankar     tmp_vec2[i] = nodesz;
2057628f99d7SShri Abhyankar     j += nodesz;
2058628f99d7SShri Abhyankar   }
20599566063dSJacob Faibussowitsch   PetscCall(PetscFree2(nsmap, tmp_vec1));
2060628f99d7SShri Abhyankar   /* Now use the correct ns */
2061628f99d7SShri Abhyankar   ns = tmp_vec2;
2062628f99d7SShri Abhyankar 
2063628f99d7SShri Abhyankar   do {
206407b50cabSHong Zhang     sctx.newshift = PETSC_FALSE;
2065628f99d7SShri Abhyankar     /* Now loop over each block-row, and do the factorization */
2066628f99d7SShri Abhyankar     for (i = 0, row = 0; i < node_max; i++) {
2067628f99d7SShri Abhyankar       nodesz = ns[i];
2068628f99d7SShri Abhyankar       nz     = bi[row + 1] - bi[row];
2069628f99d7SShri Abhyankar       bjtmp  = bj + bi[row];
2070628f99d7SShri Abhyankar 
2071628f99d7SShri Abhyankar       switch (nodesz) {
2072628f99d7SShri Abhyankar       case 1:
2073628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2074628f99d7SShri Abhyankar           idx         = bjtmp[j];
2075628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2076628f99d7SShri Abhyankar         }
2077628f99d7SShri Abhyankar 
2078628f99d7SShri Abhyankar         /* load in initial (unfactored row) */
2079628f99d7SShri Abhyankar         idx    = r[row];
2080628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2081628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2082628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2083628f99d7SShri Abhyankar 
2084628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2085628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2086628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2087628f99d7SShri Abhyankar         }
2088628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2089628f99d7SShri Abhyankar 
2090628f99d7SShri Abhyankar         prow = *bjtmp++;
2091628f99d7SShri Abhyankar         while (prow < row) {
2092628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2093628f99d7SShri Abhyankar           if (*pc1 != 0.0) {
2094628f99d7SShri Abhyankar             pv     = ba + bd[prow];
2095628f99d7SShri Abhyankar             pj     = nbj + bd[prow];
2096628f99d7SShri Abhyankar             mul1   = *pc1 * *pv++;
2097628f99d7SShri Abhyankar             *pc1   = mul1;
2098628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
20999566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2100628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2101628f99d7SShri Abhyankar               tmp = pv[j];
2102628f99d7SShri Abhyankar               idx = pj[j];
2103628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2104628f99d7SShri Abhyankar             }
2105628f99d7SShri Abhyankar           }
2106628f99d7SShri Abhyankar           prow = *bjtmp++;
2107628f99d7SShri Abhyankar         }
2108628f99d7SShri Abhyankar         pj  = bj + bi[row];
2109628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2110628f99d7SShri Abhyankar 
2111628f99d7SShri Abhyankar         sctx.pv     = rtmp11[row];
2112628f99d7SShri Abhyankar         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2113628f99d7SShri Abhyankar         rs          = 0.0;
2114628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2115628f99d7SShri Abhyankar           idx    = pj[j];
2116628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2117628f99d7SShri Abhyankar           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2118628f99d7SShri Abhyankar         }
2119628f99d7SShri Abhyankar         sctx.rs = rs;
21209566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
212107b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2122628f99d7SShri Abhyankar         break;
2123628f99d7SShri Abhyankar 
2124628f99d7SShri Abhyankar       case 2:
2125628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2126628f99d7SShri Abhyankar           idx         = bjtmp[j];
2127628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2128628f99d7SShri Abhyankar           rtmp22[idx] = 0.0;
2129628f99d7SShri Abhyankar         }
2130628f99d7SShri Abhyankar 
2131628f99d7SShri Abhyankar         /* load in initial (unfactored row) */
2132628f99d7SShri Abhyankar         idx    = r[row];
2133628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2134628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2135628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2136628f99d7SShri Abhyankar         v2     = aa + ai[idx + 1];
2137628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2138628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2139628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2140628f99d7SShri Abhyankar           rtmp22[idx] = v2[j];
2141628f99d7SShri Abhyankar         }
2142628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2143628f99d7SShri Abhyankar         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2144628f99d7SShri Abhyankar 
2145628f99d7SShri Abhyankar         prow = *bjtmp++;
2146628f99d7SShri Abhyankar         while (prow < row) {
2147628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2148628f99d7SShri Abhyankar           pc2 = rtmp22 + prow;
2149628f99d7SShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0) {
2150628f99d7SShri Abhyankar             pv   = ba + bd[prow];
2151628f99d7SShri Abhyankar             pj   = nbj + bd[prow];
2152628f99d7SShri Abhyankar             mul1 = *pc1 * *pv;
2153628f99d7SShri Abhyankar             mul2 = *pc2 * *pv;
2154628f99d7SShri Abhyankar             ++pv;
2155628f99d7SShri Abhyankar             *pc1 = mul1;
2156628f99d7SShri Abhyankar             *pc2 = mul2;
2157628f99d7SShri Abhyankar 
2158628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2159628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2160628f99d7SShri Abhyankar               tmp = pv[j];
2161628f99d7SShri Abhyankar               idx = pj[j];
2162628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2163628f99d7SShri Abhyankar               rtmp22[idx] -= mul2 * tmp;
2164628f99d7SShri Abhyankar             }
21659566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2166628f99d7SShri Abhyankar           }
2167628f99d7SShri Abhyankar           prow = *bjtmp++;
2168628f99d7SShri Abhyankar         }
2169628f99d7SShri Abhyankar 
2170628f99d7SShri Abhyankar         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2171628f99d7SShri Abhyankar         pc1 = rtmp11 + prow;
2172628f99d7SShri Abhyankar         pc2 = rtmp22 + prow;
2173628f99d7SShri Abhyankar 
2174628f99d7SShri Abhyankar         sctx.pv = *pc1;
2175628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2176628f99d7SShri Abhyankar         rs      = 0.0;
2177628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2178628f99d7SShri Abhyankar           idx = pj[j];
2179628f99d7SShri Abhyankar           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2180628f99d7SShri Abhyankar         }
2181628f99d7SShri Abhyankar         sctx.rs = rs;
21829566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
218307b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2184628f99d7SShri Abhyankar 
2185628f99d7SShri Abhyankar         if (*pc2 != 0.0) {
2186628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2187628f99d7SShri Abhyankar           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2188628f99d7SShri Abhyankar           *pc2   = mul2;
2189628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2190628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2191628f99d7SShri Abhyankar             idx = pj[j];
2192628f99d7SShri Abhyankar             tmp = rtmp11[idx];
2193628f99d7SShri Abhyankar             rtmp22[idx] -= mul2 * tmp;
2194628f99d7SShri Abhyankar           }
21959566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2196628f99d7SShri Abhyankar         }
2197628f99d7SShri Abhyankar 
2198628f99d7SShri Abhyankar         pj  = bj + bi[row];
2199628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2200628f99d7SShri Abhyankar         pc2 = ba + bi[row + 1];
2201628f99d7SShri Abhyankar 
2202628f99d7SShri Abhyankar         sctx.pv         = rtmp22[row + 1];
2203628f99d7SShri Abhyankar         rs              = 0.0;
2204628f99d7SShri Abhyankar         rtmp11[row]     = 1.0 / rtmp11[row];
2205628f99d7SShri Abhyankar         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2206628f99d7SShri Abhyankar         /* copy row entries from dense representation to sparse */
2207628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2208628f99d7SShri Abhyankar           idx    = pj[j];
2209628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx];
2210628f99d7SShri Abhyankar           pc2[j] = rtmp22[idx];
2211628f99d7SShri Abhyankar           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2212628f99d7SShri Abhyankar         }
2213628f99d7SShri Abhyankar         sctx.rs = rs;
22149566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
221507b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2216628f99d7SShri Abhyankar         break;
2217628f99d7SShri Abhyankar 
2218628f99d7SShri Abhyankar       case 3:
2219628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2220628f99d7SShri Abhyankar           idx         = bjtmp[j];
2221628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2222628f99d7SShri Abhyankar           rtmp22[idx] = 0.0;
2223628f99d7SShri Abhyankar           rtmp33[idx] = 0.0;
2224628f99d7SShri Abhyankar         }
2225628f99d7SShri Abhyankar         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2226628f99d7SShri Abhyankar         idx    = r[row];
2227628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2228628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2229628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2230628f99d7SShri Abhyankar         v2     = aa + ai[idx + 1];
2231628f99d7SShri Abhyankar         v3     = aa + ai[idx + 2];
2232628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2233628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2234628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2235628f99d7SShri Abhyankar           rtmp22[idx] = v2[j];
2236628f99d7SShri Abhyankar           rtmp33[idx] = v3[j];
2237628f99d7SShri Abhyankar         }
2238628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2239628f99d7SShri Abhyankar         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2240628f99d7SShri Abhyankar         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2241628f99d7SShri Abhyankar 
2242628f99d7SShri Abhyankar         /* loop over all pivot row blocks above this row block */
2243628f99d7SShri Abhyankar         prow = *bjtmp++;
2244628f99d7SShri Abhyankar         while (prow < row) {
2245628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2246628f99d7SShri Abhyankar           pc2 = rtmp22 + prow;
2247628f99d7SShri Abhyankar           pc3 = rtmp33 + prow;
2248628f99d7SShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2249628f99d7SShri Abhyankar             pv   = ba + bd[prow];
2250628f99d7SShri Abhyankar             pj   = nbj + bd[prow];
2251628f99d7SShri Abhyankar             mul1 = *pc1 * *pv;
2252628f99d7SShri Abhyankar             mul2 = *pc2 * *pv;
2253628f99d7SShri Abhyankar             mul3 = *pc3 * *pv;
2254628f99d7SShri Abhyankar             ++pv;
2255628f99d7SShri Abhyankar             *pc1 = mul1;
2256628f99d7SShri Abhyankar             *pc2 = mul2;
2257628f99d7SShri Abhyankar             *pc3 = mul3;
2258628f99d7SShri Abhyankar 
2259628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2260628f99d7SShri Abhyankar             /* update this row based on pivot row */
2261628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2262628f99d7SShri Abhyankar               tmp = pv[j];
2263628f99d7SShri Abhyankar               idx = pj[j];
2264628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2265628f99d7SShri Abhyankar               rtmp22[idx] -= mul2 * tmp;
2266628f99d7SShri Abhyankar               rtmp33[idx] -= mul3 * tmp;
2267628f99d7SShri Abhyankar             }
22689566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2269628f99d7SShri Abhyankar           }
2270628f99d7SShri Abhyankar           prow = *bjtmp++;
2271628f99d7SShri Abhyankar         }
2272628f99d7SShri Abhyankar 
2273628f99d7SShri Abhyankar         /* Now take care of diagonal 3x3 block in this set of rows */
2274628f99d7SShri Abhyankar         /* note: prow = row here */
2275628f99d7SShri Abhyankar         pc1 = rtmp11 + prow;
2276628f99d7SShri Abhyankar         pc2 = rtmp22 + prow;
2277628f99d7SShri Abhyankar         pc3 = rtmp33 + prow;
2278628f99d7SShri Abhyankar 
2279628f99d7SShri Abhyankar         sctx.pv = *pc1;
2280628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2281628f99d7SShri Abhyankar         rs      = 0.0;
2282628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2283628f99d7SShri Abhyankar           idx = pj[j];
2284628f99d7SShri Abhyankar           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2285628f99d7SShri Abhyankar         }
2286628f99d7SShri Abhyankar         sctx.rs = rs;
22879566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
228807b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2289628f99d7SShri Abhyankar 
2290628f99d7SShri Abhyankar         if (*pc2 != 0.0 || *pc3 != 0.0) {
2291628f99d7SShri Abhyankar           mul2   = (*pc2) / (*pc1);
2292628f99d7SShri Abhyankar           mul3   = (*pc3) / (*pc1);
2293628f99d7SShri Abhyankar           *pc2   = mul2;
2294628f99d7SShri Abhyankar           *pc3   = mul3;
2295628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2296628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2297628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2298628f99d7SShri Abhyankar             idx = pj[j];
2299628f99d7SShri Abhyankar             tmp = rtmp11[idx];
2300628f99d7SShri Abhyankar             rtmp22[idx] -= mul2 * tmp;
2301628f99d7SShri Abhyankar             rtmp33[idx] -= mul3 * tmp;
2302628f99d7SShri Abhyankar           }
23039566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2304628f99d7SShri Abhyankar         }
2305628f99d7SShri Abhyankar         ++prow;
2306628f99d7SShri Abhyankar 
2307628f99d7SShri Abhyankar         pc2     = rtmp22 + prow;
2308628f99d7SShri Abhyankar         pc3     = rtmp33 + prow;
2309628f99d7SShri Abhyankar         sctx.pv = *pc2;
2310628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2311628f99d7SShri Abhyankar         rs      = 0.0;
2312628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2313628f99d7SShri Abhyankar           idx = pj[j];
2314628f99d7SShri Abhyankar           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2315628f99d7SShri Abhyankar         }
2316628f99d7SShri Abhyankar         sctx.rs = rs;
23179566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
231807b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2319628f99d7SShri Abhyankar 
2320628f99d7SShri Abhyankar         if (*pc3 != 0.0) {
2321628f99d7SShri Abhyankar           mul3   = (*pc3) / (*pc2);
2322628f99d7SShri Abhyankar           *pc3   = mul3;
2323628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2324628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2325628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2326628f99d7SShri Abhyankar             idx = pj[j];
2327628f99d7SShri Abhyankar             tmp = rtmp22[idx];
2328628f99d7SShri Abhyankar             rtmp33[idx] -= mul3 * tmp;
2329628f99d7SShri Abhyankar           }
23309566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2331628f99d7SShri Abhyankar         }
2332628f99d7SShri Abhyankar 
2333628f99d7SShri Abhyankar         pj  = bj + bi[row];
2334628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2335628f99d7SShri Abhyankar         pc2 = ba + bi[row + 1];
2336628f99d7SShri Abhyankar         pc3 = ba + bi[row + 2];
2337628f99d7SShri Abhyankar 
2338628f99d7SShri Abhyankar         sctx.pv         = rtmp33[row + 2];
2339628f99d7SShri Abhyankar         rs              = 0.0;
2340628f99d7SShri Abhyankar         rtmp11[row]     = 1.0 / rtmp11[row];
2341628f99d7SShri Abhyankar         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2342628f99d7SShri Abhyankar         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2343628f99d7SShri Abhyankar         /* copy row entries from dense representation to sparse */
2344628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2345628f99d7SShri Abhyankar           idx    = pj[j];
2346628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx];
2347628f99d7SShri Abhyankar           pc2[j] = rtmp22[idx];
2348628f99d7SShri Abhyankar           pc3[j] = rtmp33[idx];
2349628f99d7SShri Abhyankar           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2350628f99d7SShri Abhyankar         }
2351628f99d7SShri Abhyankar 
2352628f99d7SShri Abhyankar         sctx.rs = rs;
23539566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
235407b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2355628f99d7SShri Abhyankar         break;
2356628f99d7SShri Abhyankar 
2357d71ae5a4SJacob Faibussowitsch       default:
2358d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2359628f99d7SShri Abhyankar       }
2360628f99d7SShri Abhyankar       row += nodesz; /* Update the row */
2361628f99d7SShri Abhyankar     }
2362628f99d7SShri Abhyankar   endofwhile:;
236307b50cabSHong Zhang   } while (sctx.newshift);
23649566063dSJacob Faibussowitsch   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
23659566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec2));
23669566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
23679566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
23689566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &c));
23692205254eSKarl Rupp 
2370d3ac4fa3SBarry Smith   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2371628f99d7SShri Abhyankar   /* do not set solve add, since MatSolve_Inode + Add is faster */
2372628f99d7SShri Abhyankar   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2373628f99d7SShri Abhyankar   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2374628f99d7SShri Abhyankar   C->assembled              = PETSC_TRUE;
2375628f99d7SShri Abhyankar   C->preallocated           = PETSC_TRUE;
2376628f99d7SShri Abhyankar   if (sctx.nshift) {
2377f4db908eSBarry Smith     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
23789566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2379f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
23809566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2381628f99d7SShri Abhyankar     }
2382628f99d7SShri Abhyankar   }
23839566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(C->cmap->n));
23849566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCheckInode(C));
23853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2386628f99d7SShri Abhyankar }
2387ff6a9541SJacob Faibussowitsch #endif
2388628f99d7SShri Abhyankar 
/*
   MatSolve_SeqAIJ_Inode - Triangular solves with a factored SeqAIJ matrix, processing
   the rows one inode (group of consecutive rows handled together) at a time.

   Input Parameters:
+  A  - the matrix; the routine reads a->i, a->j, a->a, a->diag, the permutations
        a->row / a->col, the work array a->solve_work and the inode description
        a->inode.size / a->inode.node_count
-  bb - right-hand side, read as b[r[row]] (r = indices of a->row)

   Output Parameter:
.  xx - solution, written as x[c[row]] (c = indices of a->col)

   Notes:
   Errors with PETSC_ERR_COR if a->inode.size is NULL or if a node size outside 1..5 is met.

   The forward sweep stores its result in a->solve_work; the backward sweep overwrites that
   array in place and scatters each finished unknown into xx.

   Every inner loop is unrolled by two over the shared column indices vi[]; the
   "if (j == nz - 1)" block handles the leftover entry when nz is odd.
*/
2389d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2390d71ae5a4SJacob Faibussowitsch {
2391019b515eSShri Abhyankar   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2392019b515eSShri Abhyankar   IS                 iscol = a->col, isrow = a->row;
2393019b515eSShri Abhyankar   const PetscInt    *r, *c, *rout, *cout;
23948758e1faSBarry Smith   PetscInt           i, j, n = A->rmap->n;
23958758e1faSBarry Smith   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
23968758e1faSBarry Smith   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2397019b515eSShri Abhyankar   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2398019b515eSShri Abhyankar   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2399019b515eSShri Abhyankar   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2400019b515eSShri Abhyankar   const PetscScalar *b;
2401019b515eSShri Abhyankar 
2402019b515eSShri Abhyankar   PetscFunctionBegin;
240308401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2404019b515eSShri Abhyankar   node_max = a->inode.node_count;
2405019b515eSShri Abhyankar   ns       = a->inode.size; /* Node Size array */
2406019b515eSShri Abhyankar 
24079566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
24089566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(xx, &x));
2409019b515eSShri Abhyankar   tmp = a->solve_work;
2410019b515eSShri Abhyankar 
24119371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
24129371c9d4SSatish Balay   r = rout;
24139371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
24149371c9d4SSatish Balay   c = cout;
2415019b515eSShri Abhyankar 
2416019b515eSShri Abhyankar   /* forward solve the lower triangular */
  /* tmps is just an alias of the work array tmp; aa/aj alias the value and column-index arrays */
2417019b515eSShri Abhyankar   tmps = tmp;
2418019b515eSShri Abhyankar   aa   = a_a;
2419019b515eSShri Abhyankar   aj   = a_j;
2420019b515eSShri Abhyankar   ad   = a->diag;
2421019b515eSShri Abhyankar 
  /*
     For each node, (vi, nz) are the column indices and count taken from the node's FIRST row
     (ai[row] .. ai[row + 1]).  The first nz values of every row of the node are applied with
     these shared indices.  The k-th following row of the node (v2 for k = 1, v3 for k = 2, ...)
     has k extra trailing values v[nz], ..., v[nz + k - 1] that multiply the sums already
     finished for the earlier rows of the same node.
  */
2422019b515eSShri Abhyankar   for (i = 0, row = 0; i < node_max; ++i) {
2423019b515eSShri Abhyankar     nsz = ns[i];
2424019b515eSShri Abhyankar     aii = ai[row];
2425019b515eSShri Abhyankar     v1  = aa + aii;
2426019b515eSShri Abhyankar     vi  = aj + aii;
2427019b515eSShri Abhyankar     nz  = ai[row + 1] - ai[row];
2428019b515eSShri Abhyankar 
242998991853SShri Abhyankar     if (i < node_max - 1) {
243098991853SShri Abhyankar       /* Prefetch the indices for the next block */
243150d8bf02SJed Brown       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
243298991853SShri Abhyankar       /* Prefetch the data for the next block */
243350d8bf02SJed Brown       PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
243498991853SShri Abhyankar     }
243598991853SShri Abhyankar 
2436019b515eSShri Abhyankar     switch (nsz) { /* Each loop in 'case' is unrolled */
2437019b515eSShri Abhyankar     case 1:
2438019b515eSShri Abhyankar       sum1 = b[r[row]];
2439019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2440019b515eSShri Abhyankar         i0   = vi[j];
2441019b515eSShri Abhyankar         i1   = vi[j + 1];
2442019b515eSShri Abhyankar         tmp0 = tmps[i0];
2443019b515eSShri Abhyankar         tmp1 = tmps[i1];
2444019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2445019b515eSShri Abhyankar       }
2446019b515eSShri Abhyankar       if (j == nz - 1) {
2447019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2448019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2449019b515eSShri Abhyankar       }
2450019b515eSShri Abhyankar       tmp[row++] = sum1;
2451019b515eSShri Abhyankar       break;
2452019b515eSShri Abhyankar     case 2:
2453019b515eSShri Abhyankar       sum1 = b[r[row]];
2454019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2455019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2456019b515eSShri Abhyankar 
2457019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2458019b515eSShri Abhyankar         i0   = vi[j];
2459019b515eSShri Abhyankar         i1   = vi[j + 1];
2460019b515eSShri Abhyankar         tmp0 = tmps[i0];
2461019b515eSShri Abhyankar         tmp1 = tmps[i1];
2462019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2463019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2464019b515eSShri Abhyankar       }
2465019b515eSShri Abhyankar       if (j == nz - 1) {
2466019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2467019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2468019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2469019b515eSShri Abhyankar       }
      /* within-node coupling: the second row's trailing value multiplies the first row's finished sum */
2470019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2471019b515eSShri Abhyankar       tmp[row++] = sum1;
2472019b515eSShri Abhyankar       tmp[row++] = sum2;
2473019b515eSShri Abhyankar       break;
2474019b515eSShri Abhyankar     case 3:
2475019b515eSShri Abhyankar       sum1 = b[r[row]];
2476019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2477019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2478019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2479019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2480019b515eSShri Abhyankar 
2481019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2482019b515eSShri Abhyankar         i0   = vi[j];
2483019b515eSShri Abhyankar         i1   = vi[j + 1];
2484019b515eSShri Abhyankar         tmp0 = tmps[i0];
2485019b515eSShri Abhyankar         tmp1 = tmps[i1];
2486019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2487019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2488019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2489019b515eSShri Abhyankar       }
2490019b515eSShri Abhyankar       if (j == nz - 1) {
2491019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2492019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2493019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2494019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2495019b515eSShri Abhyankar       }
2496019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2497019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2498019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2499019b515eSShri Abhyankar       tmp[row++] = sum1;
2500019b515eSShri Abhyankar       tmp[row++] = sum2;
2501019b515eSShri Abhyankar       tmp[row++] = sum3;
2502019b515eSShri Abhyankar       break;
2503019b515eSShri Abhyankar 
2504019b515eSShri Abhyankar     case 4:
2505019b515eSShri Abhyankar       sum1 = b[r[row]];
2506019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2507019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2508019b515eSShri Abhyankar       sum4 = b[r[row + 3]];
2509019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2510019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2511019b515eSShri Abhyankar       v4   = aa + ai[row + 3];
2512019b515eSShri Abhyankar 
2513019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2514019b515eSShri Abhyankar         i0   = vi[j];
2515019b515eSShri Abhyankar         i1   = vi[j + 1];
2516019b515eSShri Abhyankar         tmp0 = tmps[i0];
2517019b515eSShri Abhyankar         tmp1 = tmps[i1];
2518019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2519019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2520019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2521019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2522019b515eSShri Abhyankar       }
2523019b515eSShri Abhyankar       if (j == nz - 1) {
2524019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2525019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2526019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2527019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2528019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0;
2529019b515eSShri Abhyankar       }
2530019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2531019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2532019b515eSShri Abhyankar       sum4 -= v4[nz] * sum1;
2533019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2534019b515eSShri Abhyankar       sum4 -= v4[nz + 1] * sum2;
2535019b515eSShri Abhyankar       sum4 -= v4[nz + 2] * sum3;
2536019b515eSShri Abhyankar 
2537019b515eSShri Abhyankar       tmp[row++] = sum1;
2538019b515eSShri Abhyankar       tmp[row++] = sum2;
2539019b515eSShri Abhyankar       tmp[row++] = sum3;
2540019b515eSShri Abhyankar       tmp[row++] = sum4;
2541019b515eSShri Abhyankar       break;
2542019b515eSShri Abhyankar     case 5:
2543019b515eSShri Abhyankar       sum1 = b[r[row]];
2544019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2545019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2546019b515eSShri Abhyankar       sum4 = b[r[row + 3]];
2547019b515eSShri Abhyankar       sum5 = b[r[row + 4]];
2548019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2549019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2550019b515eSShri Abhyankar       v4   = aa + ai[row + 3];
2551019b515eSShri Abhyankar       v5   = aa + ai[row + 4];
2552019b515eSShri Abhyankar 
2553019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2554019b515eSShri Abhyankar         i0   = vi[j];
2555019b515eSShri Abhyankar         i1   = vi[j + 1];
2556019b515eSShri Abhyankar         tmp0 = tmps[i0];
2557019b515eSShri Abhyankar         tmp1 = tmps[i1];
2558019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2559019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2560019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2561019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2562019b515eSShri Abhyankar         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2563019b515eSShri Abhyankar       }
2564019b515eSShri Abhyankar       if (j == nz - 1) {
2565019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2566019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2567019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2568019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2569019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0;
2570019b515eSShri Abhyankar         sum5 -= v5[j] * tmp0;
2571019b515eSShri Abhyankar       }
2572019b515eSShri Abhyankar 
2573019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2574019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2575019b515eSShri Abhyankar       sum4 -= v4[nz] * sum1;
2576019b515eSShri Abhyankar       sum5 -= v5[nz] * sum1;
2577019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2578019b515eSShri Abhyankar       sum4 -= v4[nz + 1] * sum2;
2579019b515eSShri Abhyankar       sum5 -= v5[nz + 1] * sum2;
2580019b515eSShri Abhyankar       sum4 -= v4[nz + 2] * sum3;
2581019b515eSShri Abhyankar       sum5 -= v5[nz + 2] * sum3;
2582019b515eSShri Abhyankar       sum5 -= v5[nz + 3] * sum4;
2583019b515eSShri Abhyankar 
2584019b515eSShri Abhyankar       tmp[row++] = sum1;
2585019b515eSShri Abhyankar       tmp[row++] = sum2;
2586019b515eSShri Abhyankar       tmp[row++] = sum3;
2587019b515eSShri Abhyankar       tmp[row++] = sum4;
2588019b515eSShri Abhyankar       tmp[row++] = sum5;
2589019b515eSShri Abhyankar       break;
    /* NOTE(review): this error exit leaves the IS indices and Vec arrays obtained above unrestored */
2590d71ae5a4SJacob Faibussowitsch     default:
2591d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2592019b515eSShri Abhyankar     }
2593019b515eSShri Abhyankar   }
2594019b515eSShri Abhyankar   /* backward solve the upper triangular */
  /*
     Nodes are visited last-to-first and 'row' starts at the LAST row of each node.  For that row
     the values/indices start at ad[row + 1] + 1 and nz = ad[row] - ad[row + 1] - 1, so v1[nz] is
     aa[ad[row]]; the accumulated sum is MULTIPLIED by it (presumably the stored inverse of the
     diagonal -- confirm against the factorization routine).  Reading ad[row + 1] with
     row = n - 1 means ad[] is accessed at index n.

     The row k places above the node's last row is read through v2 (k = 1), v3 (k = 2), ...:
     its first k values multiply the unknowns just computed for the later rows of the node, the
     next nz values use the shared indices vi[] (hence the v[j + k] offsets), and v[nz + k]
     is the final scale factor.
  */
2595019b515eSShri Abhyankar   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2596019b515eSShri Abhyankar     nsz = ns[i];
2597019b515eSShri Abhyankar     aii = ad[row + 1] + 1;
2598019b515eSShri Abhyankar     v1  = aa + aii;
2599019b515eSShri Abhyankar     vi  = aj + aii;
2600019b515eSShri Abhyankar     nz  = ad[row] - ad[row + 1] - 1;
260198991853SShri Abhyankar 
260298991853SShri Abhyankar     if (i > 0) {
260398991853SShri Abhyankar       /* Prefetch the indices for the next block */
260450d8bf02SJed Brown       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
260598991853SShri Abhyankar       /* Prefetch the data for the next block */
260650d8bf02SJed Brown       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
260798991853SShri Abhyankar     }
260898991853SShri Abhyankar 
2609019b515eSShri Abhyankar     switch (nsz) { /* Each loop in 'case' is unrolled */
2610019b515eSShri Abhyankar     case 1:
2611019b515eSShri Abhyankar       sum1 = tmp[row];
2612019b515eSShri Abhyankar 
2613019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2614019b515eSShri Abhyankar         i0   = vi[j];
2615019b515eSShri Abhyankar         i1   = vi[j + 1];
2616019b515eSShri Abhyankar         tmp0 = tmps[i0];
2617019b515eSShri Abhyankar         tmp1 = tmps[i1];
2618019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2619019b515eSShri Abhyankar       }
2620019b515eSShri Abhyankar       if (j == nz - 1) {
2621019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2622019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2623019b515eSShri Abhyankar       }
      /* the finished unknown goes both to the work array (for rows above) and, column-permuted, to x */
26249371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum1 * v1[nz];
26259371c9d4SSatish Balay       row--;
2626019b515eSShri Abhyankar       break;
2627019b515eSShri Abhyankar     case 2:
2628019b515eSShri Abhyankar       sum1 = tmp[row];
2629019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2630019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2631019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2632019b515eSShri Abhyankar         i0   = vi[j];
2633019b515eSShri Abhyankar         i1   = vi[j + 1];
2634019b515eSShri Abhyankar         tmp0 = tmps[i0];
2635019b515eSShri Abhyankar         tmp1 = tmps[i1];
2636019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2637019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2638019b515eSShri Abhyankar       }
2639019b515eSShri Abhyankar       if (j == nz - 1) {
2640019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2641019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2642019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2643019b515eSShri Abhyankar       }
2644019b515eSShri Abhyankar 
26459371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
26469371c9d4SSatish Balay       row--;
      /* v2[0] couples this row to the unknown just computed for the row below it */
2647019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
26489371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
26499371c9d4SSatish Balay       row--;
2650019b515eSShri Abhyankar       break;
2651019b515eSShri Abhyankar     case 3:
2652019b515eSShri Abhyankar       sum1 = tmp[row];
2653019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2654019b515eSShri Abhyankar       sum3 = tmp[row - 2];
2655019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2656019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2657019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2658019b515eSShri Abhyankar         i0   = vi[j];
2659019b515eSShri Abhyankar         i1   = vi[j + 1];
2660019b515eSShri Abhyankar         tmp0 = tmps[i0];
2661019b515eSShri Abhyankar         tmp1 = tmps[i1];
2662019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2663019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2664019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2665019b515eSShri Abhyankar       }
2666019b515eSShri Abhyankar       if (j == nz - 1) {
2667019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2668019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2669019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2670019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2671019b515eSShri Abhyankar       }
26729371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
26739371c9d4SSatish Balay       row--;
2674019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2675019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
26769371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
26779371c9d4SSatish Balay       row--;
2678019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
26799371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
26809371c9d4SSatish Balay       row--;
2681019b515eSShri Abhyankar 
2682019b515eSShri Abhyankar       break;
2683019b515eSShri Abhyankar     case 4:
2684019b515eSShri Abhyankar       sum1 = tmp[row];
2685019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2686019b515eSShri Abhyankar       sum3 = tmp[row - 2];
2687019b515eSShri Abhyankar       sum4 = tmp[row - 3];
2688019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2689019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2690019b515eSShri Abhyankar       v4   = aa + ad[row - 2] + 1;
2691019b515eSShri Abhyankar 
2692019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2693019b515eSShri Abhyankar         i0   = vi[j];
2694019b515eSShri Abhyankar         i1   = vi[j + 1];
2695019b515eSShri Abhyankar         tmp0 = tmps[i0];
2696019b515eSShri Abhyankar         tmp1 = tmps[i1];
2697019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2698019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2699019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2700019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2701019b515eSShri Abhyankar       }
2702019b515eSShri Abhyankar       if (j == nz - 1) {
2703019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2704019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2705019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2706019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2707019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0;
2708019b515eSShri Abhyankar       }
2709019b515eSShri Abhyankar 
27109371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
27119371c9d4SSatish Balay       row--;
2712019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2713019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
2714019b515eSShri Abhyankar       sum4 -= v4[2] * tmp0;
27159371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
27169371c9d4SSatish Balay       row--;
2717019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
2718019b515eSShri Abhyankar       sum4 -= v4[1] * tmp0;
27199371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
27209371c9d4SSatish Balay       row--;
2721019b515eSShri Abhyankar       sum4 -= v4[0] * tmp0;
27229371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
27239371c9d4SSatish Balay       row--;
2724019b515eSShri Abhyankar       break;
2725019b515eSShri Abhyankar     case 5:
2726019b515eSShri Abhyankar       sum1 = tmp[row];
2727019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2728019b515eSShri Abhyankar       sum3 = tmp[row - 2];
2729019b515eSShri Abhyankar       sum4 = tmp[row - 3];
2730019b515eSShri Abhyankar       sum5 = tmp[row - 4];
2731019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2732019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2733019b515eSShri Abhyankar       v4   = aa + ad[row - 2] + 1;
2734019b515eSShri Abhyankar       v5   = aa + ad[row - 3] + 1;
2735019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2736019b515eSShri Abhyankar         i0   = vi[j];
2737019b515eSShri Abhyankar         i1   = vi[j + 1];
2738019b515eSShri Abhyankar         tmp0 = tmps[i0];
2739019b515eSShri Abhyankar         tmp1 = tmps[i1];
2740019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2741019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2742019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2743019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2744019b515eSShri Abhyankar         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2745019b515eSShri Abhyankar       }
2746019b515eSShri Abhyankar       if (j == nz - 1) {
2747019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2748019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2749019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2750019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2751019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0;
2752019b515eSShri Abhyankar         sum5 -= v5[j + 4] * tmp0;
2753019b515eSShri Abhyankar       }
2754019b515eSShri Abhyankar 
27559371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
27569371c9d4SSatish Balay       row--;
2757019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2758019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
2759019b515eSShri Abhyankar       sum4 -= v4[2] * tmp0;
2760019b515eSShri Abhyankar       sum5 -= v5[3] * tmp0;
27619371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
27629371c9d4SSatish Balay       row--;
2763019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
2764019b515eSShri Abhyankar       sum4 -= v4[1] * tmp0;
2765019b515eSShri Abhyankar       sum5 -= v5[2] * tmp0;
27669371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
27679371c9d4SSatish Balay       row--;
2768019b515eSShri Abhyankar       sum4 -= v4[0] * tmp0;
2769019b515eSShri Abhyankar       sum5 -= v5[1] * tmp0;
27709371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
27719371c9d4SSatish Balay       row--;
2772019b515eSShri Abhyankar       sum5 -= v5[0] * tmp0;
27739371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
27749371c9d4SSatish Balay       row--;
2775019b515eSShri Abhyankar       break;
2776d71ae5a4SJacob Faibussowitsch     default:
2777d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2778019b515eSShri Abhyankar     }
2779019b515eSShri Abhyankar   }
27809566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
27819566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
27829566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
27839566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(xx, &x));
  /* flop count logged for both sweeps together: 2 * (stored nonzeros) - (number of columns) */
27849566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
27853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2786019b515eSShri Abhyankar }
2787019b515eSShri Abhyankar 
27884c1414c8SBarry Smith /*
27894c1414c8SBarry Smith      Makes a longer coloring[] array and calls the usual code with that
27904c1414c8SBarry Smith */
279166976f2fSJacob Faibussowitsch static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2792d71ae5a4SJacob Faibussowitsch {
27934c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2794d0f46423SBarry Smith   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
27954c1414c8SBarry Smith   PetscInt        *colorused, i;
27964c1414c8SBarry Smith   ISColoringValue *newcolor;
27974c1414c8SBarry Smith 
27984c1414c8SBarry Smith   PetscFunctionBegin;
279908401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
28009566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &newcolor));
28014c1414c8SBarry Smith   /* loop over inodes, marking a color for each column*/
28024c1414c8SBarry Smith   row = 0;
28034c1414c8SBarry Smith   for (i = 0; i < m; i++) {
28046497c311SBarry Smith     for (j = 0; j < ns[i]; j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
28054c1414c8SBarry Smith   }
28064c1414c8SBarry Smith 
28074c1414c8SBarry Smith   /* eliminate unneeded colors */
28089566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2809ad540459SPierre Jolivet   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
28104c1414c8SBarry Smith 
2811ad540459SPierre Jolivet   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
28124c1414c8SBarry Smith   ncolors = colorused[5 * ncolors - 1];
28136497c311SBarry Smith   for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
28149566063dSJacob Faibussowitsch   PetscCall(PetscFree(colorused));
28159566063dSJacob Faibussowitsch   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
28169566063dSJacob Faibussowitsch   PetscCall(PetscFree(coloring));
28173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
28184c1414c8SBarry Smith }
28194c1414c8SBarry Smith 
2820af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
28212af78befSBarry Smith 
2822d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2823d71ae5a4SJacob Faibussowitsch {
28242af78befSBarry Smith   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
28257aaeff0aSMatthew G. Knepley   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
28265850ef23SBarry Smith   MatScalar         *ibdiag, *bdiag, work[25], *t;
2827a8b09249SBarry Smith   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
28287aaeff0aSMatthew G. Knepley   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
28295850ef23SBarry Smith   const PetscScalar *xb, *b;
28307b6c816cSBarry Smith   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
28318758e1faSBarry Smith   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
28328758e1faSBarry Smith   PetscInt           sz, k, ipvt[5];
28337b6c816cSBarry Smith   PetscBool          allowzeropivot, zeropivotdetected;
28348758e1faSBarry Smith   const PetscInt    *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
28352af78befSBarry Smith 
28362af78befSBarry Smith   PetscFunctionBegin;
2837a455e926SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
283808401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
283908401ef6SPierre Jolivet   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
284008401ef6SPierre Jolivet   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
28412af78befSBarry Smith 
284271f1c65dSBarry Smith   if (!a->inode.ibdiagvalid) {
28432af78befSBarry Smith     if (!a->inode.ibdiag) {
28442af78befSBarry Smith       /* calculate space needed for diagonal blocks */
2845ad540459SPierre Jolivet       for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2846f0d39aaaSBarry Smith       a->inode.bdiagsize = cnt;
28472205254eSKarl Rupp 
28489566063dSJacob Faibussowitsch       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
284971f1c65dSBarry Smith     }
285071f1c65dSBarry Smith 
285171f1c65dSBarry Smith     /* copy over the diagonal blocks and invert them */
28522af78befSBarry Smith     ibdiag = a->inode.ibdiag;
28532af78befSBarry Smith     bdiag  = a->inode.bdiag;
28542af78befSBarry Smith     cnt    = 0;
28552af78befSBarry Smith     for (i = 0, row = 0; i < m; i++) {
28562af78befSBarry Smith       for (j = 0; j < sizes[i]; j++) {
2857ad540459SPierre Jolivet         for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
28582af78befSBarry Smith       }
28599566063dSJacob Faibussowitsch       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));
28602af78befSBarry Smith 
28612af78befSBarry Smith       switch (sizes[i]) {
28622af78befSBarry Smith       case 1:
28632af78befSBarry Smith         /* Create matrix data structure */
28648e0e2a9aSHong Zhang         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
28658e0e2a9aSHong Zhang           if (allowzeropivot) {
28667b6c816cSBarry Smith             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28677b6c816cSBarry Smith             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
28687b6c816cSBarry Smith             A->factorerror_zeropivot_row   = row;
28699566063dSJacob Faibussowitsch             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
287098921bdaSJacob Faibussowitsch           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
28718e0e2a9aSHong Zhang         }
287264c62002SMatthew Knepley         ibdiag[cnt] = 1.0 / ibdiag[cnt];
28732af78befSBarry Smith         break;
28742af78befSBarry Smith       case 2:
28759566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
28767b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28772af78befSBarry Smith         break;
28782af78befSBarry Smith       case 3:
28799566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
28807b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28812af78befSBarry Smith         break;
28822af78befSBarry Smith       case 4:
28839566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
28847b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28852af78befSBarry Smith         break;
28862af78befSBarry Smith       case 5:
28879566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
28887b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28892af78befSBarry Smith         break;
2890d71ae5a4SJacob Faibussowitsch       default:
2891d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
28922af78befSBarry Smith       }
28932af78befSBarry Smith       cnt += sizes[i] * sizes[i];
28942af78befSBarry Smith       row += sizes[i];
28952af78befSBarry Smith     }
289671f1c65dSBarry Smith     a->inode.ibdiagvalid = PETSC_TRUE;
28972af78befSBarry Smith   }
28982af78befSBarry Smith   ibdiag = a->inode.ibdiag;
28992af78befSBarry Smith   bdiag  = a->inode.bdiag;
29005850ef23SBarry Smith   t      = a->inode.ssor_work;
29012af78befSBarry Smith 
29029566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
29039566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
29045850ef23SBarry Smith   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
29055850ef23SBarry Smith   if (flag & SOR_ZERO_INITIAL_GUESS) {
29062af78befSBarry Smith     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
29078862d2efSBarry Smith       for (i = 0, row = 0; i < m; i++) {
29088862d2efSBarry Smith         sz  = diag[row] - ii[row];
29098862d2efSBarry Smith         v1  = a->a + ii[row];
29108862d2efSBarry Smith         idx = a->j + ii[row];
29118862d2efSBarry Smith 
29124108e4d5SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
29138862d2efSBarry Smith         switch (sizes[i]) {
29148862d2efSBarry Smith         case 1:
29158862d2efSBarry Smith 
29168862d2efSBarry Smith           sum1 = b[row];
29178862d2efSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
29188862d2efSBarry Smith             i1 = idx[0];
29198862d2efSBarry Smith             i2 = idx[1];
29208862d2efSBarry Smith             idx += 2;
29218862d2efSBarry Smith             tmp0 = x[i1];
29228862d2efSBarry Smith             tmp1 = x[i2];
29239371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29249371c9d4SSatish Balay             v1 += 2;
29258862d2efSBarry Smith           }
29268862d2efSBarry Smith 
29278862d2efSBarry Smith           if (n == sz - 1) {
2928f0d39aaaSBarry Smith             tmp0 = x[*idx];
2929f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
29308862d2efSBarry Smith           }
29315850ef23SBarry Smith           t[row]   = sum1;
29328862d2efSBarry Smith           x[row++] = sum1 * (*ibdiag++);
29338862d2efSBarry Smith           break;
2934f0d39aaaSBarry Smith         case 2:
2935f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
2936f0d39aaaSBarry Smith           sum1 = b[row];
2937f0d39aaaSBarry Smith           sum2 = b[row + 1];
2938f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
2939f0d39aaaSBarry Smith             i1 = idx[0];
2940f0d39aaaSBarry Smith             i2 = idx[1];
2941f0d39aaaSBarry Smith             idx += 2;
2942f0d39aaaSBarry Smith             tmp0 = x[i1];
2943f0d39aaaSBarry Smith             tmp1 = x[i2];
29449371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29459371c9d4SSatish Balay             v1 += 2;
29469371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
29479371c9d4SSatish Balay             v2 += 2;
2948f0d39aaaSBarry Smith           }
2949f0d39aaaSBarry Smith 
2950f0d39aaaSBarry Smith           if (n == sz - 1) {
2951f0d39aaaSBarry Smith             tmp0 = x[*idx];
2952f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
2953f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
2954f0d39aaaSBarry Smith           }
29555850ef23SBarry Smith           t[row]     = sum1;
29565850ef23SBarry Smith           t[row + 1] = sum2;
2957f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2958f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2959f0d39aaaSBarry Smith           ibdiag += 4;
2960f0d39aaaSBarry Smith           break;
2961f0d39aaaSBarry Smith         case 3:
2962f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
2963f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
2964f0d39aaaSBarry Smith           sum1 = b[row];
2965f0d39aaaSBarry Smith           sum2 = b[row + 1];
2966f0d39aaaSBarry Smith           sum3 = b[row + 2];
2967f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
2968f0d39aaaSBarry Smith             i1 = idx[0];
2969f0d39aaaSBarry Smith             i2 = idx[1];
2970f0d39aaaSBarry Smith             idx += 2;
2971f0d39aaaSBarry Smith             tmp0 = x[i1];
2972f0d39aaaSBarry Smith             tmp1 = x[i2];
29739371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29749371c9d4SSatish Balay             v1 += 2;
29759371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
29769371c9d4SSatish Balay             v2 += 2;
29779371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
29789371c9d4SSatish Balay             v3 += 2;
2979f0d39aaaSBarry Smith           }
2980f0d39aaaSBarry Smith 
2981f0d39aaaSBarry Smith           if (n == sz - 1) {
2982f0d39aaaSBarry Smith             tmp0 = x[*idx];
2983f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
2984f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
2985f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
2986f0d39aaaSBarry Smith           }
29875850ef23SBarry Smith           t[row]     = sum1;
29885850ef23SBarry Smith           t[row + 1] = sum2;
29895850ef23SBarry Smith           t[row + 2] = sum3;
2990f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2991f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2992f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2993f0d39aaaSBarry Smith           ibdiag += 9;
2994f0d39aaaSBarry Smith           break;
2995f0d39aaaSBarry Smith         case 4:
2996f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
2997f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
2998f0d39aaaSBarry Smith           v4   = a->a + ii[row + 3];
2999f0d39aaaSBarry Smith           sum1 = b[row];
3000f0d39aaaSBarry Smith           sum2 = b[row + 1];
3001f0d39aaaSBarry Smith           sum3 = b[row + 2];
3002f0d39aaaSBarry Smith           sum4 = b[row + 3];
3003f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3004f0d39aaaSBarry Smith             i1 = idx[0];
3005f0d39aaaSBarry Smith             i2 = idx[1];
3006f0d39aaaSBarry Smith             idx += 2;
3007f0d39aaaSBarry Smith             tmp0 = x[i1];
3008f0d39aaaSBarry Smith             tmp1 = x[i2];
30099371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
30109371c9d4SSatish Balay             v1 += 2;
30119371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
30129371c9d4SSatish Balay             v2 += 2;
30139371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30149371c9d4SSatish Balay             v3 += 2;
30159371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
30169371c9d4SSatish Balay             v4 += 2;
3017f0d39aaaSBarry Smith           }
3018f0d39aaaSBarry Smith 
3019f0d39aaaSBarry Smith           if (n == sz - 1) {
3020f0d39aaaSBarry Smith             tmp0 = x[*idx];
3021f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
3022f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
3023f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
3024f0d39aaaSBarry Smith             sum4 -= v4[0] * tmp0;
3025f0d39aaaSBarry Smith           }
30265850ef23SBarry Smith           t[row]     = sum1;
30275850ef23SBarry Smith           t[row + 1] = sum2;
30285850ef23SBarry Smith           t[row + 2] = sum3;
30295850ef23SBarry Smith           t[row + 3] = sum4;
3030f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3031f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3032f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3033f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3034f0d39aaaSBarry Smith           ibdiag += 16;
3035f0d39aaaSBarry Smith           break;
3036f0d39aaaSBarry Smith         case 5:
3037f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
3038f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
3039f0d39aaaSBarry Smith           v4   = a->a + ii[row + 3];
3040f0d39aaaSBarry Smith           v5   = a->a + ii[row + 4];
3041f0d39aaaSBarry Smith           sum1 = b[row];
3042f0d39aaaSBarry Smith           sum2 = b[row + 1];
3043f0d39aaaSBarry Smith           sum3 = b[row + 2];
3044f0d39aaaSBarry Smith           sum4 = b[row + 3];
3045f0d39aaaSBarry Smith           sum5 = b[row + 4];
3046f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3047f0d39aaaSBarry Smith             i1 = idx[0];
3048f0d39aaaSBarry Smith             i2 = idx[1];
3049f0d39aaaSBarry Smith             idx += 2;
3050f0d39aaaSBarry Smith             tmp0 = x[i1];
3051f0d39aaaSBarry Smith             tmp1 = x[i2];
30529371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
30539371c9d4SSatish Balay             v1 += 2;
30549371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
30559371c9d4SSatish Balay             v2 += 2;
30569371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30579371c9d4SSatish Balay             v3 += 2;
30589371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
30599371c9d4SSatish Balay             v4 += 2;
30609371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
30619371c9d4SSatish Balay             v5 += 2;
3062f0d39aaaSBarry Smith           }
3063f0d39aaaSBarry Smith 
3064f0d39aaaSBarry Smith           if (n == sz - 1) {
3065f0d39aaaSBarry Smith             tmp0 = x[*idx];
3066f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
3067f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
3068f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
3069f0d39aaaSBarry Smith             sum4 -= v4[0] * tmp0;
3070f0d39aaaSBarry Smith             sum5 -= v5[0] * tmp0;
3071f0d39aaaSBarry Smith           }
30725850ef23SBarry Smith           t[row]     = sum1;
30735850ef23SBarry Smith           t[row + 1] = sum2;
30745850ef23SBarry Smith           t[row + 2] = sum3;
30755850ef23SBarry Smith           t[row + 3] = sum4;
30765850ef23SBarry Smith           t[row + 4] = sum5;
3077f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3078f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3079f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3080f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3081f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3082f0d39aaaSBarry Smith           ibdiag += 25;
3083f0d39aaaSBarry Smith           break;
3084d71ae5a4SJacob Faibussowitsch         default:
3085d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
30868862d2efSBarry Smith         }
30872af78befSBarry Smith       }
30882af78befSBarry Smith 
30895850ef23SBarry Smith       xb = t;
30909566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(a->nz));
30912af78befSBarry Smith     } else xb = b;
30922af78befSBarry Smith     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3093f0d39aaaSBarry Smith       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3094d0f46423SBarry Smith       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3095f0d39aaaSBarry Smith         ibdiag -= sizes[i] * sizes[i];
30968862d2efSBarry Smith         sz  = ii[row + 1] - diag[row] - 1;
30978862d2efSBarry Smith         v1  = a->a + diag[row] + 1;
30988862d2efSBarry Smith         idx = a->j + diag[row] + 1;
30992af78befSBarry Smith 
31004108e4d5SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
31018862d2efSBarry Smith         switch (sizes[i]) {
31028862d2efSBarry Smith         case 1:
31038862d2efSBarry Smith 
31048862d2efSBarry Smith           sum1 = xb[row];
31058862d2efSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
31068862d2efSBarry Smith             i1 = idx[0];
31078862d2efSBarry Smith             i2 = idx[1];
31088862d2efSBarry Smith             idx += 2;
31098862d2efSBarry Smith             tmp0 = x[i1];
31108862d2efSBarry Smith             tmp1 = x[i2];
31119371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31129371c9d4SSatish Balay             v1 += 2;
31138862d2efSBarry Smith           }
31148862d2efSBarry Smith 
31158862d2efSBarry Smith           if (n == sz - 1) {
3116f0d39aaaSBarry Smith             tmp0 = x[*idx];
3117f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
31188862d2efSBarry Smith           }
3119f0d39aaaSBarry Smith           x[row--] = sum1 * (*ibdiag);
3120f0d39aaaSBarry Smith           break;
3121f0d39aaaSBarry Smith 
3122f0d39aaaSBarry Smith         case 2:
3123f0d39aaaSBarry Smith 
3124f0d39aaaSBarry Smith           sum1 = xb[row];
3125f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3126f0d39aaaSBarry Smith           /* note that sum1 is associated with the second of the two rows */
3127f0d39aaaSBarry Smith           v2 = a->a + diag[row - 1] + 2;
3128f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3129f0d39aaaSBarry Smith             i1 = idx[0];
3130f0d39aaaSBarry Smith             i2 = idx[1];
3131f0d39aaaSBarry Smith             idx += 2;
3132f0d39aaaSBarry Smith             tmp0 = x[i1];
3133f0d39aaaSBarry Smith             tmp1 = x[i2];
31349371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31359371c9d4SSatish Balay             v1 += 2;
31369371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31379371c9d4SSatish Balay             v2 += 2;
3138f0d39aaaSBarry Smith           }
3139f0d39aaaSBarry Smith 
3140f0d39aaaSBarry Smith           if (n == sz - 1) {
3141f0d39aaaSBarry Smith             tmp0 = x[*idx];
3142f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3143f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3144f0d39aaaSBarry Smith           }
3145f0d39aaaSBarry Smith           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3146f0d39aaaSBarry Smith           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3147f0d39aaaSBarry Smith           break;
3148f0d39aaaSBarry Smith         case 3:
3149f0d39aaaSBarry Smith 
3150f0d39aaaSBarry Smith           sum1 = xb[row];
3151f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3152f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3153f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3154f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3155f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3156f0d39aaaSBarry Smith             i1 = idx[0];
3157f0d39aaaSBarry Smith             i2 = idx[1];
3158f0d39aaaSBarry Smith             idx += 2;
3159f0d39aaaSBarry Smith             tmp0 = x[i1];
3160f0d39aaaSBarry Smith             tmp1 = x[i2];
31619371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31629371c9d4SSatish Balay             v1 += 2;
31639371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31649371c9d4SSatish Balay             v2 += 2;
31659371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
31669371c9d4SSatish Balay             v3 += 2;
3167f0d39aaaSBarry Smith           }
3168f0d39aaaSBarry Smith 
3169f0d39aaaSBarry Smith           if (n == sz - 1) {
3170f0d39aaaSBarry Smith             tmp0 = x[*idx];
3171f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3172f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3173f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3174f0d39aaaSBarry Smith           }
3175f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3176f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3177f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3178f0d39aaaSBarry Smith           break;
3179f0d39aaaSBarry Smith         case 4:
3180f0d39aaaSBarry Smith 
3181f0d39aaaSBarry Smith           sum1 = xb[row];
3182f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3183f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3184f0d39aaaSBarry Smith           sum4 = xb[row - 3];
3185f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3186f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3187f0d39aaaSBarry Smith           v4   = a->a + diag[row - 3] + 4;
3188f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3189f0d39aaaSBarry Smith             i1 = idx[0];
3190f0d39aaaSBarry Smith             i2 = idx[1];
3191f0d39aaaSBarry Smith             idx += 2;
3192f0d39aaaSBarry Smith             tmp0 = x[i1];
3193f0d39aaaSBarry Smith             tmp1 = x[i2];
31949371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31959371c9d4SSatish Balay             v1 += 2;
31969371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31979371c9d4SSatish Balay             v2 += 2;
31989371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
31999371c9d4SSatish Balay             v3 += 2;
32009371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
32019371c9d4SSatish Balay             v4 += 2;
3202f0d39aaaSBarry Smith           }
3203f0d39aaaSBarry Smith 
3204f0d39aaaSBarry Smith           if (n == sz - 1) {
3205f0d39aaaSBarry Smith             tmp0 = x[*idx];
3206f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3207f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3208f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3209f0d39aaaSBarry Smith             sum4 -= *v4 * tmp0;
3210f0d39aaaSBarry Smith           }
3211f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3212f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3213f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3214f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3215f0d39aaaSBarry Smith           break;
3216f0d39aaaSBarry Smith         case 5:
3217f0d39aaaSBarry Smith 
3218f0d39aaaSBarry Smith           sum1 = xb[row];
3219f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3220f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3221f0d39aaaSBarry Smith           sum4 = xb[row - 3];
3222f0d39aaaSBarry Smith           sum5 = xb[row - 4];
3223f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3224f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3225f0d39aaaSBarry Smith           v4   = a->a + diag[row - 3] + 4;
3226f0d39aaaSBarry Smith           v5   = a->a + diag[row - 4] + 5;
3227f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3228f0d39aaaSBarry Smith             i1 = idx[0];
3229f0d39aaaSBarry Smith             i2 = idx[1];
3230f0d39aaaSBarry Smith             idx += 2;
3231f0d39aaaSBarry Smith             tmp0 = x[i1];
3232f0d39aaaSBarry Smith             tmp1 = x[i2];
32339371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
32349371c9d4SSatish Balay             v1 += 2;
32359371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
32369371c9d4SSatish Balay             v2 += 2;
32379371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
32389371c9d4SSatish Balay             v3 += 2;
32399371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
32409371c9d4SSatish Balay             v4 += 2;
32419371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
32429371c9d4SSatish Balay             v5 += 2;
3243f0d39aaaSBarry Smith           }
3244f0d39aaaSBarry Smith 
3245f0d39aaaSBarry Smith           if (n == sz - 1) {
3246f0d39aaaSBarry Smith             tmp0 = x[*idx];
3247f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3248f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3249f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3250f0d39aaaSBarry Smith             sum4 -= *v4 * tmp0;
3251f0d39aaaSBarry Smith             sum5 -= *v5 * tmp0;
3252f0d39aaaSBarry Smith           }
3253f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3254f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3255f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3256f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3257f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
32588862d2efSBarry Smith           break;
3259d71ae5a4SJacob Faibussowitsch         default:
3260d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
32618862d2efSBarry Smith         }
32622af78befSBarry Smith       }
32632af78befSBarry Smith 
32649566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(a->nz));
32652af78befSBarry Smith     }
32662af78befSBarry Smith     its--;
32675850ef23SBarry Smith   }
32685850ef23SBarry Smith   while (its--) {
32695850ef23SBarry Smith     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
32709371c9d4SSatish Balay       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3271d876e2b0SMark Adams         sz  = diag[row] - ii[row];
32725850ef23SBarry Smith         v1  = a->a + ii[row];
32735850ef23SBarry Smith         idx = a->j + ii[row];
32745850ef23SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
32755850ef23SBarry Smith         switch (sizes[i]) {
32765850ef23SBarry Smith         case 1:
32775850ef23SBarry Smith           sum1 = b[row];
32785850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
32795850ef23SBarry Smith             i1 = idx[0];
32805850ef23SBarry Smith             i2 = idx[1];
32815850ef23SBarry Smith             idx += 2;
32825850ef23SBarry Smith             tmp0 = x[i1];
32835850ef23SBarry Smith             tmp1 = x[i2];
32849371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
32859371c9d4SSatish Balay             v1 += 2;
32865850ef23SBarry Smith           }
32875850ef23SBarry Smith           if (n == sz - 1) {
3288d876e2b0SMark Adams             tmp0 = x[*idx++];
3289d876e2b0SMark Adams             sum1 -= *v1 * tmp0;
3290d876e2b0SMark Adams             v1++;
3291d876e2b0SMark Adams           }
3292d876e2b0SMark Adams           t[row] = sum1;
3293d876e2b0SMark Adams           sz     = ii[row + 1] - diag[row] - 1;
3294d876e2b0SMark Adams           idx    = a->j + diag[row] + 1;
3295d876e2b0SMark Adams           v1 += 1;
3296d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3297d876e2b0SMark Adams             i1 = idx[0];
3298d876e2b0SMark Adams             i2 = idx[1];
3299d876e2b0SMark Adams             idx += 2;
3300d876e2b0SMark Adams             tmp0 = x[i1];
3301d876e2b0SMark Adams             tmp1 = x[i2];
33029371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33039371c9d4SSatish Balay             v1 += 2;
3304d876e2b0SMark Adams           }
3305d876e2b0SMark Adams           if (n == sz - 1) {
3306d876e2b0SMark Adams             tmp0 = x[*idx++];
33075850ef23SBarry Smith             sum1 -= *v1 * tmp0;
33085850ef23SBarry Smith           }
33095850ef23SBarry Smith           /* in MatSOR_SeqAIJ this line would be
33105850ef23SBarry Smith            *
33115850ef23SBarry Smith            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
33125850ef23SBarry Smith            *
33135850ef23SBarry Smith            * but omega == 1, so this becomes
33145850ef23SBarry Smith            *
3315d876e2b0SMark Adams            * x[row] = sum1*(*ibdiag++);
33165850ef23SBarry Smith            *
33175850ef23SBarry Smith            */
3318d876e2b0SMark Adams           x[row] = sum1 * (*ibdiag);
33195850ef23SBarry Smith           break;
33205850ef23SBarry Smith         case 2:
33215850ef23SBarry Smith           v2   = a->a + ii[row + 1];
33225850ef23SBarry Smith           sum1 = b[row];
33235850ef23SBarry Smith           sum2 = b[row + 1];
33245850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
33255850ef23SBarry Smith             i1 = idx[0];
33265850ef23SBarry Smith             i2 = idx[1];
33275850ef23SBarry Smith             idx += 2;
33285850ef23SBarry Smith             tmp0 = x[i1];
33295850ef23SBarry Smith             tmp1 = x[i2];
33309371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33319371c9d4SSatish Balay             v1 += 2;
33329371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
33339371c9d4SSatish Balay             v2 += 2;
33345850ef23SBarry Smith           }
3335d876e2b0SMark Adams           if (n == sz - 1) {
3336d876e2b0SMark Adams             tmp0 = x[*idx++];
3337d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3338d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
33399371c9d4SSatish Balay             v1++;
33409371c9d4SSatish Balay             v2++;
3341d876e2b0SMark Adams           }
3342d876e2b0SMark Adams           t[row]     = sum1;
3343d876e2b0SMark Adams           t[row + 1] = sum2;
3344d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 2;
3345d876e2b0SMark Adams           idx        = a->j + diag[row] + 2;
3346d876e2b0SMark Adams           v1 += 2;
3347d876e2b0SMark Adams           v2 += 2;
3348d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3349d876e2b0SMark Adams             i1 = idx[0];
3350d876e2b0SMark Adams             i2 = idx[1];
3351d876e2b0SMark Adams             idx += 2;
3352d876e2b0SMark Adams             tmp0 = x[i1];
3353d876e2b0SMark Adams             tmp1 = x[i2];
33549371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33559371c9d4SSatish Balay             v1 += 2;
33569371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
33579371c9d4SSatish Balay             v2 += 2;
3358d876e2b0SMark Adams           }
33595850ef23SBarry Smith           if (n == sz - 1) {
33605850ef23SBarry Smith             tmp0 = x[*idx];
33615850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
33625850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
33635850ef23SBarry Smith           }
3364d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3365d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
33665850ef23SBarry Smith           break;
33675850ef23SBarry Smith         case 3:
33685850ef23SBarry Smith           v2   = a->a + ii[row + 1];
33695850ef23SBarry Smith           v3   = a->a + ii[row + 2];
33705850ef23SBarry Smith           sum1 = b[row];
33715850ef23SBarry Smith           sum2 = b[row + 1];
33725850ef23SBarry Smith           sum3 = b[row + 2];
33735850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
33745850ef23SBarry Smith             i1 = idx[0];
33755850ef23SBarry Smith             i2 = idx[1];
33765850ef23SBarry Smith             idx += 2;
33775850ef23SBarry Smith             tmp0 = x[i1];
33785850ef23SBarry Smith             tmp1 = x[i2];
33799371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33809371c9d4SSatish Balay             v1 += 2;
33819371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
33829371c9d4SSatish Balay             v2 += 2;
33839371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
33849371c9d4SSatish Balay             v3 += 2;
33855850ef23SBarry Smith           }
3386d876e2b0SMark Adams           if (n == sz - 1) {
3387d876e2b0SMark Adams             tmp0 = x[*idx++];
3388d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3389d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3390d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
33919371c9d4SSatish Balay             v1++;
33929371c9d4SSatish Balay             v2++;
33939371c9d4SSatish Balay             v3++;
3394d876e2b0SMark Adams           }
3395d876e2b0SMark Adams           t[row]     = sum1;
3396d876e2b0SMark Adams           t[row + 1] = sum2;
3397d876e2b0SMark Adams           t[row + 2] = sum3;
3398d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 3;
3399d876e2b0SMark Adams           idx        = a->j + diag[row] + 3;
3400d876e2b0SMark Adams           v1 += 3;
3401d876e2b0SMark Adams           v2 += 3;
3402d876e2b0SMark Adams           v3 += 3;
3403d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3404d876e2b0SMark Adams             i1 = idx[0];
3405d876e2b0SMark Adams             i2 = idx[1];
3406d876e2b0SMark Adams             idx += 2;
3407d876e2b0SMark Adams             tmp0 = x[i1];
3408d876e2b0SMark Adams             tmp1 = x[i2];
34099371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34109371c9d4SSatish Balay             v1 += 2;
34119371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34129371c9d4SSatish Balay             v2 += 2;
34139371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34149371c9d4SSatish Balay             v3 += 2;
3415d876e2b0SMark Adams           }
34165850ef23SBarry Smith           if (n == sz - 1) {
34175850ef23SBarry Smith             tmp0 = x[*idx];
34185850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
34195850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
34205850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
34215850ef23SBarry Smith           }
3422d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3423d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3424d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
34255850ef23SBarry Smith           break;
34265850ef23SBarry Smith         case 4:
34275850ef23SBarry Smith           v2   = a->a + ii[row + 1];
34285850ef23SBarry Smith           v3   = a->a + ii[row + 2];
34295850ef23SBarry Smith           v4   = a->a + ii[row + 3];
34305850ef23SBarry Smith           sum1 = b[row];
34315850ef23SBarry Smith           sum2 = b[row + 1];
34325850ef23SBarry Smith           sum3 = b[row + 2];
34335850ef23SBarry Smith           sum4 = b[row + 3];
34345850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
34355850ef23SBarry Smith             i1 = idx[0];
34365850ef23SBarry Smith             i2 = idx[1];
34375850ef23SBarry Smith             idx += 2;
34385850ef23SBarry Smith             tmp0 = x[i1];
34395850ef23SBarry Smith             tmp1 = x[i2];
34409371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34419371c9d4SSatish Balay             v1 += 2;
34429371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34439371c9d4SSatish Balay             v2 += 2;
34449371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34459371c9d4SSatish Balay             v3 += 2;
34469371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
34479371c9d4SSatish Balay             v4 += 2;
34485850ef23SBarry Smith           }
3449d876e2b0SMark Adams           if (n == sz - 1) {
3450d876e2b0SMark Adams             tmp0 = x[*idx++];
3451d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3452d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3453d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
3454d876e2b0SMark Adams             sum4 -= v4[0] * tmp0;
34559371c9d4SSatish Balay             v1++;
34569371c9d4SSatish Balay             v2++;
34579371c9d4SSatish Balay             v3++;
34589371c9d4SSatish Balay             v4++;
3459d876e2b0SMark Adams           }
3460d876e2b0SMark Adams           t[row]     = sum1;
3461d876e2b0SMark Adams           t[row + 1] = sum2;
3462d876e2b0SMark Adams           t[row + 2] = sum3;
3463d876e2b0SMark Adams           t[row + 3] = sum4;
3464d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 4;
3465d876e2b0SMark Adams           idx        = a->j + diag[row] + 4;
3466d876e2b0SMark Adams           v1 += 4;
3467d876e2b0SMark Adams           v2 += 4;
3468d876e2b0SMark Adams           v3 += 4;
3469d876e2b0SMark Adams           v4 += 4;
3470d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3471d876e2b0SMark Adams             i1 = idx[0];
3472d876e2b0SMark Adams             i2 = idx[1];
3473d876e2b0SMark Adams             idx += 2;
3474d876e2b0SMark Adams             tmp0 = x[i1];
3475d876e2b0SMark Adams             tmp1 = x[i2];
34769371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34779371c9d4SSatish Balay             v1 += 2;
34789371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34799371c9d4SSatish Balay             v2 += 2;
34809371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34819371c9d4SSatish Balay             v3 += 2;
34829371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
34839371c9d4SSatish Balay             v4 += 2;
3484d876e2b0SMark Adams           }
34855850ef23SBarry Smith           if (n == sz - 1) {
34865850ef23SBarry Smith             tmp0 = x[*idx];
34875850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
34885850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
34895850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
34905850ef23SBarry Smith             sum4 -= v4[0] * tmp0;
34915850ef23SBarry Smith           }
3492d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3493d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3494d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3495d876e2b0SMark Adams           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
34965850ef23SBarry Smith           break;
34975850ef23SBarry Smith         case 5:
34985850ef23SBarry Smith           v2   = a->a + ii[row + 1];
34995850ef23SBarry Smith           v3   = a->a + ii[row + 2];
35005850ef23SBarry Smith           v4   = a->a + ii[row + 3];
35015850ef23SBarry Smith           v5   = a->a + ii[row + 4];
35025850ef23SBarry Smith           sum1 = b[row];
35035850ef23SBarry Smith           sum2 = b[row + 1];
35045850ef23SBarry Smith           sum3 = b[row + 2];
35055850ef23SBarry Smith           sum4 = b[row + 3];
35065850ef23SBarry Smith           sum5 = b[row + 4];
35075850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
35085850ef23SBarry Smith             i1 = idx[0];
35095850ef23SBarry Smith             i2 = idx[1];
35105850ef23SBarry Smith             idx += 2;
35115850ef23SBarry Smith             tmp0 = x[i1];
35125850ef23SBarry Smith             tmp1 = x[i2];
35139371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
35149371c9d4SSatish Balay             v1 += 2;
35159371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
35169371c9d4SSatish Balay             v2 += 2;
35179371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
35189371c9d4SSatish Balay             v3 += 2;
35199371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
35209371c9d4SSatish Balay             v4 += 2;
35219371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
35229371c9d4SSatish Balay             v5 += 2;
35235850ef23SBarry Smith           }
35245850ef23SBarry Smith           if (n == sz - 1) {
3525d876e2b0SMark Adams             tmp0 = x[*idx++];
35265850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
35275850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
35285850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
35295850ef23SBarry Smith             sum4 -= v4[0] * tmp0;
35305850ef23SBarry Smith             sum5 -= v5[0] * tmp0;
35319371c9d4SSatish Balay             v1++;
35329371c9d4SSatish Balay             v2++;
35339371c9d4SSatish Balay             v3++;
35349371c9d4SSatish Balay             v4++;
35359371c9d4SSatish Balay             v5++;
35365850ef23SBarry Smith           }
3537d876e2b0SMark Adams           t[row]     = sum1;
3538d876e2b0SMark Adams           t[row + 1] = sum2;
3539d876e2b0SMark Adams           t[row + 2] = sum3;
3540d876e2b0SMark Adams           t[row + 3] = sum4;
3541d876e2b0SMark Adams           t[row + 4] = sum5;
3542d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 5;
3543d876e2b0SMark Adams           idx        = a->j + diag[row] + 5;
3544d876e2b0SMark Adams           v1 += 5;
3545d876e2b0SMark Adams           v2 += 5;
3546d876e2b0SMark Adams           v3 += 5;
3547d876e2b0SMark Adams           v4 += 5;
3548d876e2b0SMark Adams           v5 += 5;
35495850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
35505850ef23SBarry Smith             i1 = idx[0];
35515850ef23SBarry Smith             i2 = idx[1];
35525850ef23SBarry Smith             idx += 2;
35535850ef23SBarry Smith             tmp0 = x[i1];
35545850ef23SBarry Smith             tmp1 = x[i2];
35559371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
35569371c9d4SSatish Balay             v1 += 2;
35579371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
35589371c9d4SSatish Balay             v2 += 2;
35599371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
35609371c9d4SSatish Balay             v3 += 2;
35619371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
35629371c9d4SSatish Balay             v4 += 2;
35639371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
35649371c9d4SSatish Balay             v5 += 2;
35655850ef23SBarry Smith           }
35665850ef23SBarry Smith           if (n == sz - 1) {
35675850ef23SBarry Smith             tmp0 = x[*idx];
3568d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3569d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3570d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
3571d876e2b0SMark Adams             sum4 -= v4[0] * tmp0;
3572d876e2b0SMark Adams             sum5 -= v5[0] * tmp0;
35735850ef23SBarry Smith           }
3574d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3575d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3576d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3577d876e2b0SMark Adams           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3578d876e2b0SMark Adams           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3579d876e2b0SMark Adams           break;
3580d71ae5a4SJacob Faibussowitsch         default:
3581d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3582d876e2b0SMark Adams         }
3583d876e2b0SMark Adams       }
3584d876e2b0SMark Adams       xb = t;
35859566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3586d876e2b0SMark Adams     } else xb = b;
3587d876e2b0SMark Adams 
3588d876e2b0SMark Adams     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3589d876e2b0SMark Adams       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3590d876e2b0SMark Adams       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3591d876e2b0SMark Adams         ibdiag -= sizes[i] * sizes[i];
3592d876e2b0SMark Adams 
3593d876e2b0SMark Adams         /* set RHS */
3594d876e2b0SMark Adams         if (xb == b) {
3595d876e2b0SMark Adams           /* whole (old way) */
3596d876e2b0SMark Adams           sz  = ii[row + 1] - ii[row];
3597d876e2b0SMark Adams           idx = a->j + ii[row];
3598d876e2b0SMark Adams           switch (sizes[i]) {
3599d71ae5a4SJacob Faibussowitsch           case 5:
3600d71ae5a4SJacob Faibussowitsch             v5 = a->a + ii[row - 4]; /* fall through */
3601d71ae5a4SJacob Faibussowitsch           case 4:
3602d71ae5a4SJacob Faibussowitsch             v4 = a->a + ii[row - 3]; /* fall through */
3603d71ae5a4SJacob Faibussowitsch           case 3:
3604d71ae5a4SJacob Faibussowitsch             v3 = a->a + ii[row - 2]; /* fall through */
3605d71ae5a4SJacob Faibussowitsch           case 2:
3606d71ae5a4SJacob Faibussowitsch             v2 = a->a + ii[row - 1]; /* fall through */
3607d71ae5a4SJacob Faibussowitsch           case 1:
3608d71ae5a4SJacob Faibussowitsch             v1 = a->a + ii[row];
3609d71ae5a4SJacob Faibussowitsch             break;
3610d71ae5a4SJacob Faibussowitsch           default:
3611d71ae5a4SJacob Faibussowitsch             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3612d876e2b0SMark Adams           }
3613d876e2b0SMark Adams         } else {
3614d876e2b0SMark Adams           /* upper, no diag */
3615d876e2b0SMark Adams           sz  = ii[row + 1] - diag[row] - 1;
3616d876e2b0SMark Adams           idx = a->j + diag[row] + 1;
3617d876e2b0SMark Adams           switch (sizes[i]) {
3618d71ae5a4SJacob Faibussowitsch           case 5:
3619d71ae5a4SJacob Faibussowitsch             v5 = a->a + diag[row - 4] + 5; /* fall through */
3620d71ae5a4SJacob Faibussowitsch           case 4:
3621d71ae5a4SJacob Faibussowitsch             v4 = a->a + diag[row - 3] + 4; /* fall through */
3622d71ae5a4SJacob Faibussowitsch           case 3:
3623d71ae5a4SJacob Faibussowitsch             v3 = a->a + diag[row - 2] + 3; /* fall through */
3624d71ae5a4SJacob Faibussowitsch           case 2:
3625d71ae5a4SJacob Faibussowitsch             v2 = a->a + diag[row - 1] + 2; /* fall through */
3626d71ae5a4SJacob Faibussowitsch           case 1:
3627d71ae5a4SJacob Faibussowitsch             v1 = a->a + diag[row] + 1;
3628d876e2b0SMark Adams           }
3629d876e2b0SMark Adams         }
3630d876e2b0SMark Adams         /* set sum */
3631d876e2b0SMark Adams         switch (sizes[i]) {
3632d71ae5a4SJacob Faibussowitsch         case 5:
3633d71ae5a4SJacob Faibussowitsch           sum5 = xb[row - 4]; /* fall through */
3634d71ae5a4SJacob Faibussowitsch         case 4:
3635d71ae5a4SJacob Faibussowitsch           sum4 = xb[row - 3]; /* fall through */
3636d71ae5a4SJacob Faibussowitsch         case 3:
3637d71ae5a4SJacob Faibussowitsch           sum3 = xb[row - 2]; /* fall through */
3638d71ae5a4SJacob Faibussowitsch         case 2:
3639d71ae5a4SJacob Faibussowitsch           sum2 = xb[row - 1]; /* fall through */
3640d876e2b0SMark Adams         case 1:
3641d876e2b0SMark Adams           /* note that sum1 is associated with the last row */
3642d876e2b0SMark Adams           sum1 = xb[row];
3643d876e2b0SMark Adams         }
3644d876e2b0SMark Adams         /* do sums */
3645d876e2b0SMark Adams         for (n = 0; n < sz - 1; n += 2) {
3646d876e2b0SMark Adams           i1 = idx[0];
3647d876e2b0SMark Adams           i2 = idx[1];
3648d876e2b0SMark Adams           idx += 2;
3649d876e2b0SMark Adams           tmp0 = x[i1];
3650d876e2b0SMark Adams           tmp1 = x[i2];
3651d876e2b0SMark Adams           switch (sizes[i]) {
3652d71ae5a4SJacob Faibussowitsch           case 5:
3653d71ae5a4SJacob Faibussowitsch             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3654d71ae5a4SJacob Faibussowitsch             v5 += 2; /* fall through */
3655d71ae5a4SJacob Faibussowitsch           case 4:
3656d71ae5a4SJacob Faibussowitsch             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3657d71ae5a4SJacob Faibussowitsch             v4 += 2; /* fall through */
3658d71ae5a4SJacob Faibussowitsch           case 3:
3659d71ae5a4SJacob Faibussowitsch             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3660d71ae5a4SJacob Faibussowitsch             v3 += 2; /* fall through */
3661d71ae5a4SJacob Faibussowitsch           case 2:
3662d71ae5a4SJacob Faibussowitsch             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3663d71ae5a4SJacob Faibussowitsch             v2 += 2; /* fall through */
3664d71ae5a4SJacob Faibussowitsch           case 1:
3665d71ae5a4SJacob Faibussowitsch             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3666d71ae5a4SJacob Faibussowitsch             v1 += 2;
3667d876e2b0SMark Adams           }
3668d876e2b0SMark Adams         }
3669d876e2b0SMark Adams         /* ragged edge */
3670d876e2b0SMark Adams         if (n == sz - 1) {
3671d876e2b0SMark Adams           tmp0 = x[*idx];
3672d876e2b0SMark Adams           switch (sizes[i]) {
3673d71ae5a4SJacob Faibussowitsch           case 5:
3674d71ae5a4SJacob Faibussowitsch             sum5 -= *v5 * tmp0; /* fall through */
3675d71ae5a4SJacob Faibussowitsch           case 4:
3676d71ae5a4SJacob Faibussowitsch             sum4 -= *v4 * tmp0; /* fall through */
3677d71ae5a4SJacob Faibussowitsch           case 3:
3678d71ae5a4SJacob Faibussowitsch             sum3 -= *v3 * tmp0; /* fall through */
3679d71ae5a4SJacob Faibussowitsch           case 2:
3680d71ae5a4SJacob Faibussowitsch             sum2 -= *v2 * tmp0; /* fall through */
3681d71ae5a4SJacob Faibussowitsch           case 1:
3682d71ae5a4SJacob Faibussowitsch             sum1 -= *v1 * tmp0;
3683d876e2b0SMark Adams           }
3684d876e2b0SMark Adams         }
3685d876e2b0SMark Adams         /* update */
3686d876e2b0SMark Adams         if (xb == b) {
3687d876e2b0SMark Adams           /* whole (old way) w/ diag */
3688d876e2b0SMark Adams           switch (sizes[i]) {
3689d876e2b0SMark Adams           case 5:
36905850ef23SBarry Smith             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
36915850ef23SBarry Smith             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
36925850ef23SBarry Smith             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
36935850ef23SBarry Smith             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
36945850ef23SBarry Smith             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
36955850ef23SBarry Smith             break;
3696d876e2b0SMark Adams           case 4:
3697d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3698d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3699d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3700d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3701d876e2b0SMark Adams             break;
3702d876e2b0SMark Adams           case 3:
3703d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3704d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3705d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3706d876e2b0SMark Adams             break;
3707d876e2b0SMark Adams           case 2:
3708d876e2b0SMark Adams             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3709d876e2b0SMark Adams             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3710d876e2b0SMark Adams             break;
3711d71ae5a4SJacob Faibussowitsch           case 1:
3712d71ae5a4SJacob Faibussowitsch             x[row--] += sum1 * (*ibdiag);
3713d71ae5a4SJacob Faibussowitsch             break;
3714d876e2b0SMark Adams           }
3715d876e2b0SMark Adams         } else {
3716d876e2b0SMark Adams           /* no diag so set =  */
3717d876e2b0SMark Adams           switch (sizes[i]) {
3718d876e2b0SMark Adams           case 5:
3719d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3720d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3721d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3722d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3723d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3724d876e2b0SMark Adams             break;
3725d876e2b0SMark Adams           case 4:
3726d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3727d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3728d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3729d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3730d876e2b0SMark Adams             break;
3731d876e2b0SMark Adams           case 3:
3732d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3733d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3734d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3735d876e2b0SMark Adams             break;
3736d876e2b0SMark Adams           case 2:
3737d876e2b0SMark Adams             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3738d876e2b0SMark Adams             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3739d876e2b0SMark Adams             break;
3740d71ae5a4SJacob Faibussowitsch           case 1:
3741d71ae5a4SJacob Faibussowitsch             x[row--] = sum1 * (*ibdiag);
3742d71ae5a4SJacob Faibussowitsch             break;
37435850ef23SBarry Smith           }
37445850ef23SBarry Smith         }
3745d876e2b0SMark Adams       }
3746d876e2b0SMark Adams       if (xb == b) {
37479566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(2.0 * a->nz));
3748d876e2b0SMark Adams       } else {
37499566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3750d876e2b0SMark Adams       }
37515850ef23SBarry Smith     }
37522af78befSBarry Smith   }
375389c6957cSBarry Smith   if (flag & SOR_EISENSTAT) {
375489c6957cSBarry Smith     /*
375589c6957cSBarry Smith           Apply  (U + D)^-1  where D is now the block diagonal
375689c6957cSBarry Smith     */
375789c6957cSBarry Smith     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
375889c6957cSBarry Smith     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
375989c6957cSBarry Smith       ibdiag -= sizes[i] * sizes[i];
376089c6957cSBarry Smith       sz  = ii[row + 1] - diag[row] - 1;
376189c6957cSBarry Smith       v1  = a->a + diag[row] + 1;
376289c6957cSBarry Smith       idx = a->j + diag[row] + 1;
37634108e4d5SBarry Smith       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
376489c6957cSBarry Smith       switch (sizes[i]) {
376589c6957cSBarry Smith       case 1:
376689c6957cSBarry Smith 
376789c6957cSBarry Smith         sum1 = b[row];
376889c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
376989c6957cSBarry Smith           i1 = idx[0];
377089c6957cSBarry Smith           i2 = idx[1];
377189c6957cSBarry Smith           idx += 2;
377289c6957cSBarry Smith           tmp0 = x[i1];
377389c6957cSBarry Smith           tmp1 = x[i2];
37749371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
37759371c9d4SSatish Balay           v1 += 2;
377689c6957cSBarry Smith         }
377789c6957cSBarry Smith 
377889c6957cSBarry Smith         if (n == sz - 1) {
377989c6957cSBarry Smith           tmp0 = x[*idx];
378089c6957cSBarry Smith           sum1 -= *v1 * tmp0;
378189c6957cSBarry Smith         }
37829371c9d4SSatish Balay         x[row] = sum1 * (*ibdiag);
37839371c9d4SSatish Balay         row--;
378489c6957cSBarry Smith         break;
378589c6957cSBarry Smith 
378689c6957cSBarry Smith       case 2:
378789c6957cSBarry Smith 
378889c6957cSBarry Smith         sum1 = b[row];
378989c6957cSBarry Smith         sum2 = b[row - 1];
379089c6957cSBarry Smith         /* note that sum1 is associated with the second of the two rows */
379189c6957cSBarry Smith         v2 = a->a + diag[row - 1] + 2;
379289c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
379389c6957cSBarry Smith           i1 = idx[0];
379489c6957cSBarry Smith           i2 = idx[1];
379589c6957cSBarry Smith           idx += 2;
379689c6957cSBarry Smith           tmp0 = x[i1];
379789c6957cSBarry Smith           tmp1 = x[i2];
37989371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
37999371c9d4SSatish Balay           v1 += 2;
38009371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38019371c9d4SSatish Balay           v2 += 2;
380289c6957cSBarry Smith         }
380389c6957cSBarry Smith 
380489c6957cSBarry Smith         if (n == sz - 1) {
380589c6957cSBarry Smith           tmp0 = x[*idx];
380689c6957cSBarry Smith           sum1 -= *v1 * tmp0;
380789c6957cSBarry Smith           sum2 -= *v2 * tmp0;
380889c6957cSBarry Smith         }
3809938d4eb3SBarry Smith         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3810938d4eb3SBarry Smith         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3811938d4eb3SBarry Smith         row -= 2;
381289c6957cSBarry Smith         break;
381389c6957cSBarry Smith       case 3:
381489c6957cSBarry Smith 
381589c6957cSBarry Smith         sum1 = b[row];
381689c6957cSBarry Smith         sum2 = b[row - 1];
381789c6957cSBarry Smith         sum3 = b[row - 2];
381889c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
381989c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
382089c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
382189c6957cSBarry Smith           i1 = idx[0];
382289c6957cSBarry Smith           i2 = idx[1];
382389c6957cSBarry Smith           idx += 2;
382489c6957cSBarry Smith           tmp0 = x[i1];
382589c6957cSBarry Smith           tmp1 = x[i2];
38269371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38279371c9d4SSatish Balay           v1 += 2;
38289371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38299371c9d4SSatish Balay           v2 += 2;
38309371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
38319371c9d4SSatish Balay           v3 += 2;
383289c6957cSBarry Smith         }
383389c6957cSBarry Smith 
383489c6957cSBarry Smith         if (n == sz - 1) {
383589c6957cSBarry Smith           tmp0 = x[*idx];
383689c6957cSBarry Smith           sum1 -= *v1 * tmp0;
383789c6957cSBarry Smith           sum2 -= *v2 * tmp0;
383889c6957cSBarry Smith           sum3 -= *v3 * tmp0;
383989c6957cSBarry Smith         }
3840938d4eb3SBarry Smith         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3841938d4eb3SBarry Smith         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3842938d4eb3SBarry Smith         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3843938d4eb3SBarry Smith         row -= 3;
384489c6957cSBarry Smith         break;
384589c6957cSBarry Smith       case 4:
384689c6957cSBarry Smith 
384789c6957cSBarry Smith         sum1 = b[row];
384889c6957cSBarry Smith         sum2 = b[row - 1];
384989c6957cSBarry Smith         sum3 = b[row - 2];
385089c6957cSBarry Smith         sum4 = b[row - 3];
385189c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
385289c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
385389c6957cSBarry Smith         v4   = a->a + diag[row - 3] + 4;
385489c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
385589c6957cSBarry Smith           i1 = idx[0];
385689c6957cSBarry Smith           i2 = idx[1];
385789c6957cSBarry Smith           idx += 2;
385889c6957cSBarry Smith           tmp0 = x[i1];
385989c6957cSBarry Smith           tmp1 = x[i2];
38609371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38619371c9d4SSatish Balay           v1 += 2;
38629371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38639371c9d4SSatish Balay           v2 += 2;
38649371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
38659371c9d4SSatish Balay           v3 += 2;
38669371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
38679371c9d4SSatish Balay           v4 += 2;
386889c6957cSBarry Smith         }
386989c6957cSBarry Smith 
387089c6957cSBarry Smith         if (n == sz - 1) {
387189c6957cSBarry Smith           tmp0 = x[*idx];
387289c6957cSBarry Smith           sum1 -= *v1 * tmp0;
387389c6957cSBarry Smith           sum2 -= *v2 * tmp0;
387489c6957cSBarry Smith           sum3 -= *v3 * tmp0;
387589c6957cSBarry Smith           sum4 -= *v4 * tmp0;
387689c6957cSBarry Smith         }
3877938d4eb3SBarry Smith         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3878938d4eb3SBarry Smith         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3879938d4eb3SBarry Smith         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3880938d4eb3SBarry Smith         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3881938d4eb3SBarry Smith         row -= 4;
388289c6957cSBarry Smith         break;
388389c6957cSBarry Smith       case 5:
388489c6957cSBarry Smith 
388589c6957cSBarry Smith         sum1 = b[row];
388689c6957cSBarry Smith         sum2 = b[row - 1];
388789c6957cSBarry Smith         sum3 = b[row - 2];
388889c6957cSBarry Smith         sum4 = b[row - 3];
388989c6957cSBarry Smith         sum5 = b[row - 4];
389089c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
389189c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
389289c6957cSBarry Smith         v4   = a->a + diag[row - 3] + 4;
389389c6957cSBarry Smith         v5   = a->a + diag[row - 4] + 5;
389489c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
389589c6957cSBarry Smith           i1 = idx[0];
389689c6957cSBarry Smith           i2 = idx[1];
389789c6957cSBarry Smith           idx += 2;
389889c6957cSBarry Smith           tmp0 = x[i1];
389989c6957cSBarry Smith           tmp1 = x[i2];
39009371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
39019371c9d4SSatish Balay           v1 += 2;
39029371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
39039371c9d4SSatish Balay           v2 += 2;
39049371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
39059371c9d4SSatish Balay           v3 += 2;
39069371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
39079371c9d4SSatish Balay           v4 += 2;
39089371c9d4SSatish Balay           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
39099371c9d4SSatish Balay           v5 += 2;
391089c6957cSBarry Smith         }
391189c6957cSBarry Smith 
391289c6957cSBarry Smith         if (n == sz - 1) {
391389c6957cSBarry Smith           tmp0 = x[*idx];
391489c6957cSBarry Smith           sum1 -= *v1 * tmp0;
391589c6957cSBarry Smith           sum2 -= *v2 * tmp0;
391689c6957cSBarry Smith           sum3 -= *v3 * tmp0;
391789c6957cSBarry Smith           sum4 -= *v4 * tmp0;
391889c6957cSBarry Smith           sum5 -= *v5 * tmp0;
391989c6957cSBarry Smith         }
3920938d4eb3SBarry Smith         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3921938d4eb3SBarry Smith         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3922938d4eb3SBarry Smith         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3923938d4eb3SBarry Smith         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3924938d4eb3SBarry Smith         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3925938d4eb3SBarry Smith         row -= 5;
392689c6957cSBarry Smith         break;
3927d71ae5a4SJacob Faibussowitsch       default:
3928d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
392989c6957cSBarry Smith       }
393089c6957cSBarry Smith     }
39319566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
393289c6957cSBarry Smith 
393389c6957cSBarry Smith     /*
393489c6957cSBarry Smith            t = b - D x    where D is the block diagonal
393589c6957cSBarry Smith     */
393689c6957cSBarry Smith     cnt = 0;
393789c6957cSBarry Smith     for (i = 0, row = 0; i < m; i++) {
393889c6957cSBarry Smith       switch (sizes[i]) {
393989c6957cSBarry Smith       case 1:
39409371c9d4SSatish Balay         t[row] = b[row] - bdiag[cnt++] * x[row];
39419371c9d4SSatish Balay         row++;
394289c6957cSBarry Smith         break;
394389c6957cSBarry Smith       case 2:
39449371c9d4SSatish Balay         x1         = x[row];
39459371c9d4SSatish Balay         x2         = x[row + 1];
394689c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
394789c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
394889c6957cSBarry Smith         t[row]     = b[row] - tmp1;
39499371c9d4SSatish Balay         t[row + 1] = b[row + 1] - tmp2;
39509371c9d4SSatish Balay         row += 2;
395189c6957cSBarry Smith         cnt += 4;
395289c6957cSBarry Smith         break;
395389c6957cSBarry Smith       case 3:
39549371c9d4SSatish Balay         x1         = x[row];
39559371c9d4SSatish Balay         x2         = x[row + 1];
39569371c9d4SSatish Balay         x3         = x[row + 2];
395789c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
395889c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
395989c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
396089c6957cSBarry Smith         t[row]     = b[row] - tmp1;
396189c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
39629371c9d4SSatish Balay         t[row + 2] = b[row + 2] - tmp3;
39639371c9d4SSatish Balay         row += 3;
396489c6957cSBarry Smith         cnt += 9;
396589c6957cSBarry Smith         break;
396689c6957cSBarry Smith       case 4:
39679371c9d4SSatish Balay         x1         = x[row];
39689371c9d4SSatish Balay         x2         = x[row + 1];
39699371c9d4SSatish Balay         x3         = x[row + 2];
39709371c9d4SSatish Balay         x4         = x[row + 3];
397189c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
397289c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
397389c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
397489c6957cSBarry Smith         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
397589c6957cSBarry Smith         t[row]     = b[row] - tmp1;
397689c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
397789c6957cSBarry Smith         t[row + 2] = b[row + 2] - tmp3;
39789371c9d4SSatish Balay         t[row + 3] = b[row + 3] - tmp4;
39799371c9d4SSatish Balay         row += 4;
398089c6957cSBarry Smith         cnt += 16;
398189c6957cSBarry Smith         break;
398289c6957cSBarry Smith       case 5:
39839371c9d4SSatish Balay         x1         = x[row];
39849371c9d4SSatish Balay         x2         = x[row + 1];
39859371c9d4SSatish Balay         x3         = x[row + 2];
39869371c9d4SSatish Balay         x4         = x[row + 3];
39879371c9d4SSatish Balay         x5         = x[row + 4];
398889c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
398989c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
399089c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
399189c6957cSBarry Smith         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
399289c6957cSBarry Smith         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
399389c6957cSBarry Smith         t[row]     = b[row] - tmp1;
399489c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
399589c6957cSBarry Smith         t[row + 2] = b[row + 2] - tmp3;
399689c6957cSBarry Smith         t[row + 3] = b[row + 3] - tmp4;
39979371c9d4SSatish Balay         t[row + 4] = b[row + 4] - tmp5;
39989371c9d4SSatish Balay         row += 5;
399989c6957cSBarry Smith         cnt += 25;
400089c6957cSBarry Smith         break;
4001d71ae5a4SJacob Faibussowitsch       default:
4002d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
400389c6957cSBarry Smith       }
400489c6957cSBarry Smith     }
40059566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(m));
400689c6957cSBarry Smith 
400789c6957cSBarry Smith     /*
400889c6957cSBarry Smith           Apply (L + D)^-1 where D is the block diagonal
400989c6957cSBarry Smith     */
401089c6957cSBarry Smith     for (i = 0, row = 0; i < m; i++) {
401189c6957cSBarry Smith       sz  = diag[row] - ii[row];
401289c6957cSBarry Smith       v1  = a->a + ii[row];
401389c6957cSBarry Smith       idx = a->j + ii[row];
40144108e4d5SBarry Smith       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
401589c6957cSBarry Smith       switch (sizes[i]) {
401689c6957cSBarry Smith       case 1:
401789c6957cSBarry Smith 
401889c6957cSBarry Smith         sum1 = t[row];
401989c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
402089c6957cSBarry Smith           i1 = idx[0];
402189c6957cSBarry Smith           i2 = idx[1];
402289c6957cSBarry Smith           idx += 2;
402389c6957cSBarry Smith           tmp0 = t[i1];
402489c6957cSBarry Smith           tmp1 = t[i2];
40259371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
40269371c9d4SSatish Balay           v1 += 2;
402789c6957cSBarry Smith         }
402889c6957cSBarry Smith 
402989c6957cSBarry Smith         if (n == sz - 1) {
403089c6957cSBarry Smith           tmp0 = t[*idx];
403189c6957cSBarry Smith           sum1 -= *v1 * tmp0;
403289c6957cSBarry Smith         }
40339371c9d4SSatish Balay         x[row] += t[row] = sum1 * (*ibdiag++);
40349371c9d4SSatish Balay         row++;
403589c6957cSBarry Smith         break;
403689c6957cSBarry Smith       case 2:
403789c6957cSBarry Smith         v2   = a->a + ii[row + 1];
403889c6957cSBarry Smith         sum1 = t[row];
403989c6957cSBarry Smith         sum2 = t[row + 1];
404089c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
404189c6957cSBarry Smith           i1 = idx[0];
404289c6957cSBarry Smith           i2 = idx[1];
404389c6957cSBarry Smith           idx += 2;
404489c6957cSBarry Smith           tmp0 = t[i1];
404589c6957cSBarry Smith           tmp1 = t[i2];
40469371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
40479371c9d4SSatish Balay           v1 += 2;
40489371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
40499371c9d4SSatish Balay           v2 += 2;
405089c6957cSBarry Smith         }
405189c6957cSBarry Smith 
405289c6957cSBarry Smith         if (n == sz - 1) {
405389c6957cSBarry Smith           tmp0 = t[*idx];
405489c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
405589c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
405689c6957cSBarry Smith         }
405789c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
405889c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
40599371c9d4SSatish Balay         ibdiag += 4;
40609371c9d4SSatish Balay         row += 2;
406189c6957cSBarry Smith         break;
406289c6957cSBarry Smith       case 3:
406389c6957cSBarry Smith         v2   = a->a + ii[row + 1];
406489c6957cSBarry Smith         v3   = a->a + ii[row + 2];
406589c6957cSBarry Smith         sum1 = t[row];
406689c6957cSBarry Smith         sum2 = t[row + 1];
406789c6957cSBarry Smith         sum3 = t[row + 2];
406889c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
406989c6957cSBarry Smith           i1 = idx[0];
407089c6957cSBarry Smith           i2 = idx[1];
407189c6957cSBarry Smith           idx += 2;
407289c6957cSBarry Smith           tmp0 = t[i1];
407389c6957cSBarry Smith           tmp1 = t[i2];
40749371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
40759371c9d4SSatish Balay           v1 += 2;
40769371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
40779371c9d4SSatish Balay           v2 += 2;
40789371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
40799371c9d4SSatish Balay           v3 += 2;
408089c6957cSBarry Smith         }
408189c6957cSBarry Smith 
408289c6957cSBarry Smith         if (n == sz - 1) {
408389c6957cSBarry Smith           tmp0 = t[*idx];
408489c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
408589c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
408689c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
408789c6957cSBarry Smith         }
408889c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
408989c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
409089c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
40919371c9d4SSatish Balay         ibdiag += 9;
40929371c9d4SSatish Balay         row += 3;
409389c6957cSBarry Smith         break;
409489c6957cSBarry Smith       case 4:
409589c6957cSBarry Smith         v2   = a->a + ii[row + 1];
409689c6957cSBarry Smith         v3   = a->a + ii[row + 2];
409789c6957cSBarry Smith         v4   = a->a + ii[row + 3];
409889c6957cSBarry Smith         sum1 = t[row];
409989c6957cSBarry Smith         sum2 = t[row + 1];
410089c6957cSBarry Smith         sum3 = t[row + 2];
410189c6957cSBarry Smith         sum4 = t[row + 3];
410289c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
410389c6957cSBarry Smith           i1 = idx[0];
410489c6957cSBarry Smith           i2 = idx[1];
410589c6957cSBarry Smith           idx += 2;
410689c6957cSBarry Smith           tmp0 = t[i1];
410789c6957cSBarry Smith           tmp1 = t[i2];
41089371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
41099371c9d4SSatish Balay           v1 += 2;
41109371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
41119371c9d4SSatish Balay           v2 += 2;
41129371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
41139371c9d4SSatish Balay           v3 += 2;
41149371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
41159371c9d4SSatish Balay           v4 += 2;
411689c6957cSBarry Smith         }
411789c6957cSBarry Smith 
411889c6957cSBarry Smith         if (n == sz - 1) {
411989c6957cSBarry Smith           tmp0 = t[*idx];
412089c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
412189c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
412289c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
412389c6957cSBarry Smith           sum4 -= v4[0] * tmp0;
412489c6957cSBarry Smith         }
412589c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
412689c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
412789c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
412889c6957cSBarry Smith         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
41299371c9d4SSatish Balay         ibdiag += 16;
41309371c9d4SSatish Balay         row += 4;
413189c6957cSBarry Smith         break;
413289c6957cSBarry Smith       case 5:
413389c6957cSBarry Smith         v2   = a->a + ii[row + 1];
413489c6957cSBarry Smith         v3   = a->a + ii[row + 2];
413589c6957cSBarry Smith         v4   = a->a + ii[row + 3];
413689c6957cSBarry Smith         v5   = a->a + ii[row + 4];
413789c6957cSBarry Smith         sum1 = t[row];
413889c6957cSBarry Smith         sum2 = t[row + 1];
413989c6957cSBarry Smith         sum3 = t[row + 2];
414089c6957cSBarry Smith         sum4 = t[row + 3];
414189c6957cSBarry Smith         sum5 = t[row + 4];
414289c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
414389c6957cSBarry Smith           i1 = idx[0];
414489c6957cSBarry Smith           i2 = idx[1];
414589c6957cSBarry Smith           idx += 2;
414689c6957cSBarry Smith           tmp0 = t[i1];
414789c6957cSBarry Smith           tmp1 = t[i2];
41489371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
41499371c9d4SSatish Balay           v1 += 2;
41509371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
41519371c9d4SSatish Balay           v2 += 2;
41529371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
41539371c9d4SSatish Balay           v3 += 2;
41549371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
41559371c9d4SSatish Balay           v4 += 2;
41569371c9d4SSatish Balay           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
41579371c9d4SSatish Balay           v5 += 2;
415889c6957cSBarry Smith         }
415989c6957cSBarry Smith 
416089c6957cSBarry Smith         if (n == sz - 1) {
416189c6957cSBarry Smith           tmp0 = t[*idx];
416289c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
416389c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
416489c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
416589c6957cSBarry Smith           sum4 -= v4[0] * tmp0;
416689c6957cSBarry Smith           sum5 -= v5[0] * tmp0;
416789c6957cSBarry Smith         }
416889c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
416989c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
417089c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
417189c6957cSBarry Smith         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
417289c6957cSBarry Smith         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
41739371c9d4SSatish Balay         ibdiag += 25;
41749371c9d4SSatish Balay         row += 5;
417589c6957cSBarry Smith         break;
4176d71ae5a4SJacob Faibussowitsch       default:
4177d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
417889c6957cSBarry Smith       }
417989c6957cSBarry Smith     }
41809566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
41815850ef23SBarry Smith   }
41829566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
41839566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
41843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41852af78befSBarry Smith }
41862af78befSBarry Smith 
4187ff6a9541SJacob Faibussowitsch static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4188d71ae5a4SJacob Faibussowitsch {
418989c6957cSBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
419089c6957cSBarry Smith   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
419189c6957cSBarry Smith   const MatScalar   *bdiag = a->inode.bdiag;
419289c6957cSBarry Smith   const PetscScalar *b;
419389c6957cSBarry Smith   PetscInt           m = a->inode.node_count, cnt = 0, i, row;
419489c6957cSBarry Smith   const PetscInt    *sizes = a->inode.size;
41952af78befSBarry Smith 
419689c6957cSBarry Smith   PetscFunctionBegin;
419708401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
41989566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
41999566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
420089c6957cSBarry Smith   cnt = 0;
420189c6957cSBarry Smith   for (i = 0, row = 0; i < m; i++) {
420289c6957cSBarry Smith     switch (sizes[i]) {
420389c6957cSBarry Smith     case 1:
42049371c9d4SSatish Balay       x[row] = b[row] * bdiag[cnt++];
42059371c9d4SSatish Balay       row++;
420689c6957cSBarry Smith       break;
420789c6957cSBarry Smith     case 2:
42089371c9d4SSatish Balay       x1       = b[row];
42099371c9d4SSatish Balay       x2       = b[row + 1];
421089c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
421189c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
421289c6957cSBarry Smith       x[row++] = tmp1;
421389c6957cSBarry Smith       x[row++] = tmp2;
421489c6957cSBarry Smith       cnt += 4;
421589c6957cSBarry Smith       break;
421689c6957cSBarry Smith     case 3:
42179371c9d4SSatish Balay       x1       = b[row];
42189371c9d4SSatish Balay       x2       = b[row + 1];
42199371c9d4SSatish Balay       x3       = b[row + 2];
422089c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
422189c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
422289c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
422389c6957cSBarry Smith       x[row++] = tmp1;
422489c6957cSBarry Smith       x[row++] = tmp2;
422589c6957cSBarry Smith       x[row++] = tmp3;
422689c6957cSBarry Smith       cnt += 9;
422789c6957cSBarry Smith       break;
422889c6957cSBarry Smith     case 4:
42299371c9d4SSatish Balay       x1       = b[row];
42309371c9d4SSatish Balay       x2       = b[row + 1];
42319371c9d4SSatish Balay       x3       = b[row + 2];
42329371c9d4SSatish Balay       x4       = b[row + 3];
423389c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
423489c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
423589c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
423689c6957cSBarry Smith       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
423789c6957cSBarry Smith       x[row++] = tmp1;
423889c6957cSBarry Smith       x[row++] = tmp2;
423989c6957cSBarry Smith       x[row++] = tmp3;
424089c6957cSBarry Smith       x[row++] = tmp4;
424189c6957cSBarry Smith       cnt += 16;
424289c6957cSBarry Smith       break;
424389c6957cSBarry Smith     case 5:
42449371c9d4SSatish Balay       x1       = b[row];
42459371c9d4SSatish Balay       x2       = b[row + 1];
42469371c9d4SSatish Balay       x3       = b[row + 2];
42479371c9d4SSatish Balay       x4       = b[row + 3];
42489371c9d4SSatish Balay       x5       = b[row + 4];
424989c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
425089c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
425189c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
425289c6957cSBarry Smith       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
425389c6957cSBarry Smith       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
425489c6957cSBarry Smith       x[row++] = tmp1;
425589c6957cSBarry Smith       x[row++] = tmp2;
425689c6957cSBarry Smith       x[row++] = tmp3;
425789c6957cSBarry Smith       x[row++] = tmp4;
425889c6957cSBarry Smith       x[row++] = tmp5;
425989c6957cSBarry Smith       cnt += 25;
426089c6957cSBarry Smith       break;
4261d71ae5a4SJacob Faibussowitsch     default:
4262d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
426389c6957cSBarry Smith     }
426489c6957cSBarry Smith   }
42659566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * cnt));
42669566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
42679566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
42683ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
426989c6957cSBarry Smith }
427089c6957cSBarry Smith 
4271d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4272d71ae5a4SJacob Faibussowitsch {
4273b215bc84SStefano Zampini   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4274b215bc84SStefano Zampini 
4275b215bc84SStefano Zampini   PetscFunctionBegin;
4276b215bc84SStefano Zampini   a->inode.node_count       = 0;
4277b215bc84SStefano Zampini   a->inode.use              = PETSC_FALSE;
4278b215bc84SStefano Zampini   a->inode.checked          = PETSC_FALSE;
4279b215bc84SStefano Zampini   a->inode.mat_nonzerostate = -1;
4280b215bc84SStefano Zampini   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4281b215bc84SStefano Zampini   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4282b215bc84SStefano Zampini   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4283b215bc84SStefano Zampini   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4284b215bc84SStefano Zampini   A->ops->coloringpatch     = NULL;
4285b215bc84SStefano Zampini   A->ops->multdiagonalblock = NULL;
4286ad540459SPierre Jolivet   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
42873ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4288b215bc84SStefano Zampini }
4289b215bc84SStefano Zampini 
42904c1414c8SBarry Smith /*
42914c1414c8SBarry Smith     samestructure indicates that the matrix has not changed its nonzero structure so we
42924c1414c8SBarry Smith     do not need to recompute the inodes
42934c1414c8SBarry Smith */
4294d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode(Mat A)
4295d71ae5a4SJacob Faibussowitsch {
42964c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
42978758e1faSBarry Smith   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4298ace3abfcSBarry Smith   PetscBool       flag;
42998758e1faSBarry Smith   const PetscInt *idx, *idy, *ii;
43004c1414c8SBarry Smith 
43014c1414c8SBarry Smith   PetscFunctionBegin;
4302b215bc84SStefano Zampini   if (!a->inode.use) {
43039566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
43049566063dSJacob Faibussowitsch     PetscCall(PetscFree(a->inode.size));
43053ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
4306b215bc84SStefano Zampini   }
43073ba16761SJacob Faibussowitsch   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
43084c1414c8SBarry Smith 
4309d0f46423SBarry Smith   m = A->rmap->n;
43109566063dSJacob Faibussowitsch   if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4311b215bc84SStefano Zampini   ns = a->inode.size;
43124c1414c8SBarry Smith 
43134c1414c8SBarry Smith   i          = 0;
43144c1414c8SBarry Smith   node_count = 0;
43154c1414c8SBarry Smith   idx        = a->j;
43164c1414c8SBarry Smith   ii         = a->i;
43176f2c871aSStefano Zampini   if (idx) {
43184c1414c8SBarry Smith     while (i < m) {            /* For each row */
43194c1414c8SBarry Smith       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
43204c1414c8SBarry Smith       /* Limits the number of elements in a node to 'a->inode.limit' */
43214c1414c8SBarry Smith       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
43224c1414c8SBarry Smith         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
43234c1414c8SBarry Smith         if (nzy != nzx) break;
43244c1414c8SBarry Smith         idy += nzx; /* Same nonzero pattern */
43259566063dSJacob Faibussowitsch         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
43264c1414c8SBarry Smith         if (!flag) break;
43274c1414c8SBarry Smith       }
43284c1414c8SBarry Smith       ns[node_count++] = blk_size;
43294c1414c8SBarry Smith       idx += blk_size * nzx;
43304c1414c8SBarry Smith       i = j;
43314c1414c8SBarry Smith     }
43326f2c871aSStefano Zampini   }
43334c1414c8SBarry Smith   /* If not enough inodes found,, do not use inode version of the routines */
43346f2c871aSStefano Zampini   if (!m || !idx || node_count > .8 * m) {
43359566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
43369566063dSJacob Faibussowitsch     PetscCall(PetscFree(a->inode.size));
43379566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
43384c1414c8SBarry Smith   } else {
4339d5f3da31SBarry Smith     if (!A->factortype) {
4340375a6242SBarry Smith       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4341375a6242SBarry Smith       if (A->rmap->n == A->cmap->n) {
43424108e4d5SBarry Smith         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
43434108e4d5SBarry Smith         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
43444108e4d5SBarry Smith         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
43454108e4d5SBarry Smith         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
43464108e4d5SBarry Smith         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4347375a6242SBarry Smith       }
4348d3ac4fa3SBarry Smith     } else {
4349d3ac4fa3SBarry Smith       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4350d3ac4fa3SBarry Smith     }
43514c1414c8SBarry Smith     a->inode.node_count = node_count;
43529566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
43534c1414c8SBarry Smith   }
4354be6adb11SBarry Smith   a->inode.checked          = PETSC_TRUE;
4355a02bda8eSBarry Smith   a->inode.mat_nonzerostate = A->nonzerostate;
43563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
43574c1414c8SBarry Smith }
43584c1414c8SBarry Smith 
4359d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4360d71ae5a4SJacob Faibussowitsch {
4361150f0143SBarry Smith   Mat         B = *C;
4362150f0143SBarry Smith   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4363150f0143SBarry Smith   PetscInt    m = A->rmap->n;
4364150f0143SBarry Smith 
4365150f0143SBarry Smith   PetscFunctionBegin;
4366150f0143SBarry Smith   c->inode.use              = a->inode.use;
4367150f0143SBarry Smith   c->inode.limit            = a->inode.limit;
4368150f0143SBarry Smith   c->inode.max_limit        = a->inode.max_limit;
4369ec710b6aSStefano Zampini   c->inode.checked          = PETSC_FALSE;
4370ec710b6aSStefano Zampini   c->inode.size             = NULL;
4371ec710b6aSStefano Zampini   c->inode.node_count       = 0;
4372ec710b6aSStefano Zampini   c->inode.ibdiagvalid      = PETSC_FALSE;
4373ec710b6aSStefano Zampini   c->inode.ibdiag           = NULL;
4374ec710b6aSStefano Zampini   c->inode.bdiag            = NULL;
4375ec710b6aSStefano Zampini   c->inode.mat_nonzerostate = -1;
4376b215bc84SStefano Zampini   if (a->inode.use) {
4377ec710b6aSStefano Zampini     if (a->inode.checked && a->inode.size) {
43789566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(m + 1, &c->inode.size));
43799566063dSJacob Faibussowitsch       PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));
4380ec710b6aSStefano Zampini 
4381ec710b6aSStefano Zampini       c->inode.checked          = PETSC_TRUE;
4382ec710b6aSStefano Zampini       c->inode.node_count       = a->inode.node_count;
4383ec710b6aSStefano Zampini       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4384ec710b6aSStefano Zampini     }
4385a02bda8eSBarry Smith     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
43862c451681SBarry Smith     if (!B->factortype) {
43872c451681SBarry Smith       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
43882c451681SBarry Smith       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
43892c451681SBarry Smith       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
43902c451681SBarry Smith       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
43912c451681SBarry Smith       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
43922c451681SBarry Smith       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4393150f0143SBarry Smith     } else {
43942c451681SBarry Smith       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4395150f0143SBarry Smith     }
4396150f0143SBarry Smith   }
43973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4398150f0143SBarry Smith }
4399150f0143SBarry Smith 
4400d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4401d71ae5a4SJacob Faibussowitsch {
44028758e1faSBarry Smith   PetscInt        k;
44038758e1faSBarry Smith   const PetscInt *vi;
44046e111a19SKarl Rupp 
440517454e89SShri Abhyankar   PetscFunctionBegin;
440617454e89SShri Abhyankar   vi = aj + ai[row];
440717454e89SShri Abhyankar   for (k = 0; k < nzl; k++) cols[k] = vi[k];
440817454e89SShri Abhyankar   vi        = aj + adiag[row];
440917454e89SShri Abhyankar   cols[nzl] = vi[0];
441017454e89SShri Abhyankar   vi        = aj + adiag[row + 1] + 1;
441117454e89SShri Abhyankar   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
44123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
441317454e89SShri Abhyankar }
44146936b636SHong Zhang /*
4415a02bda8eSBarry Smith    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4416a02bda8eSBarry Smith    Modified from MatSeqAIJCheckInode().
44176936b636SHong Zhang 
44186936b636SHong Zhang    Input Parameters:
4419abb87a52SBarry Smith .  Mat A - ILU or LU matrix factor
4420abb87a52SBarry Smith 
44216936b636SHong Zhang */
4422d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4423d71ae5a4SJacob Faibussowitsch {
4424019b515eSShri Abhyankar   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4425019b515eSShri Abhyankar   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
44268758e1faSBarry Smith   PetscInt       *cols1, *cols2, *ns;
44278758e1faSBarry Smith   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4428ace3abfcSBarry Smith   PetscBool       flag;
4429019b515eSShri Abhyankar 
4430019b515eSShri Abhyankar   PetscFunctionBegin;
44313ba16761SJacob Faibussowitsch   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
44323ba16761SJacob Faibussowitsch   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4433019b515eSShri Abhyankar 
4434019b515eSShri Abhyankar   m = A->rmap->n;
44352205254eSKarl Rupp   if (a->inode.size) ns = a->inode.size;
443648a46eb9SPierre Jolivet   else PetscCall(PetscMalloc1(m + 1, &ns));
4437019b515eSShri Abhyankar 
4438019b515eSShri Abhyankar   i          = 0;
4439019b515eSShri Abhyankar   node_count = 0;
44409566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4441019b515eSShri Abhyankar   while (i < m) {                       /* For each row */
4442019b515eSShri Abhyankar     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4443019b515eSShri Abhyankar     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4444019b515eSShri Abhyankar     nzx  = nzl1 + nzu1 + 1;
44453ba16761SJacob Faibussowitsch     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4446019b515eSShri Abhyankar 
4447019b515eSShri Abhyankar     /* Limits the number of elements in a node to 'a->inode.limit' */
4448019b515eSShri Abhyankar     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4449019b515eSShri Abhyankar       nzl2 = ai[j + 1] - ai[j];
4450019b515eSShri Abhyankar       nzu2 = adiag[j] - adiag[j + 1] - 1;
4451019b515eSShri Abhyankar       nzy  = nzl2 + nzu2 + 1;
4452019b515eSShri Abhyankar       if (nzy != nzx) break;
44539566063dSJacob Faibussowitsch       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
44549566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
44558758e1faSBarry Smith       if (!flag) break;
4456019b515eSShri Abhyankar     }
4457019b515eSShri Abhyankar     ns[node_count++] = blk_size;
4458019b515eSShri Abhyankar     i                = j;
4459019b515eSShri Abhyankar   }
44609566063dSJacob Faibussowitsch   PetscCall(PetscFree2(cols1, cols2));
4461019b515eSShri Abhyankar   /* If not enough inodes found,, do not use inode version of the routines */
4462be6adb11SBarry Smith   if (!m || node_count > .8 * m) {
44639566063dSJacob Faibussowitsch     PetscCall(PetscFree(ns));
44642205254eSKarl Rupp 
4465019b515eSShri Abhyankar     a->inode.node_count = 0;
44660298fd71SBarry Smith     a->inode.size       = NULL;
4467019b515eSShri Abhyankar     a->inode.use        = PETSC_FALSE;
44682205254eSKarl Rupp 
44699566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4470019b515eSShri Abhyankar   } else {
4471f4259b30SLisandro Dalcin     A->ops->mult              = NULL;
4472f4259b30SLisandro Dalcin     A->ops->sor               = NULL;
4473f4259b30SLisandro Dalcin     A->ops->multadd           = NULL;
4474f4259b30SLisandro Dalcin     A->ops->getrowij          = NULL;
4475f4259b30SLisandro Dalcin     A->ops->restorerowij      = NULL;
4476f4259b30SLisandro Dalcin     A->ops->getcolumnij       = NULL;
4477f4259b30SLisandro Dalcin     A->ops->restorecolumnij   = NULL;
4478f4259b30SLisandro Dalcin     A->ops->coloringpatch     = NULL;
4479f4259b30SLisandro Dalcin     A->ops->multdiagonalblock = NULL;
4480019b515eSShri Abhyankar     a->inode.node_count       = node_count;
4481019b515eSShri Abhyankar     a->inode.size             = ns;
44822205254eSKarl Rupp 
44839566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4484019b515eSShri Abhyankar   }
4485be6adb11SBarry Smith   a->inode.checked = PETSC_TRUE;
44863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4487019b515eSShri Abhyankar }
4488019b515eSShri Abhyankar 
4489d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4490d71ae5a4SJacob Faibussowitsch {
4491acf2f550SJed Brown   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4492acf2f550SJed Brown 
4493acf2f550SJed Brown   PetscFunctionBegin;
4494acf2f550SJed Brown   a->inode.ibdiagvalid = PETSC_FALSE;
44953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4496acf2f550SJed Brown }
4497acf2f550SJed Brown 
44984c1414c8SBarry Smith /*
44994c1414c8SBarry Smith      This is really ugly. if inodes are used this replaces the
45004c1414c8SBarry Smith   permutations with ones that correspond to rows/cols of the matrix
4501467446fbSPierre Jolivet   rather than inode blocks
45024c1414c8SBarry Smith */
4503d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4504d71ae5a4SJacob Faibussowitsch {
45054c1414c8SBarry Smith   PetscFunctionBegin;
4506cac4c232SBarry Smith   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
45073ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
45084c1414c8SBarry Smith }
45094c1414c8SBarry Smith 
4510d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4511d71ae5a4SJacob Faibussowitsch {
45124c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
45135d0c19d7SBarry Smith   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
45145d0c19d7SBarry Smith   const PetscInt *ridx, *cidx;
45154c1414c8SBarry Smith   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
45164c1414c8SBarry Smith   PetscInt        nslim_col, *ns_col;
45174c1414c8SBarry Smith   IS              ris = *rperm, cis = *cperm;
45184c1414c8SBarry Smith 
45194c1414c8SBarry Smith   PetscFunctionBegin;
45203ba16761SJacob Faibussowitsch   if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS);           /* no inodes so return */
45213ba16761SJacob Faibussowitsch   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
45224c1414c8SBarry Smith 
45239566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4524*32603206SJames Wright   PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
45259566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(m, &permr, n, &permc));
45264c1414c8SBarry Smith 
45279566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(ris, &ridx));
45289566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(cis, &cidx));
45294c1414c8SBarry Smith 
4530baca6076SPierre Jolivet   /* Form the inode structure for the rows of permuted matrix using inv perm*/
45314c1414c8SBarry Smith   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
45324c1414c8SBarry Smith 
45334c1414c8SBarry Smith   /* Construct the permutations for rows*/
45344c1414c8SBarry Smith   for (i = 0, row = 0; i < nslim_row; ++i) {
45354c1414c8SBarry Smith     indx      = ridx[i];
45364c1414c8SBarry Smith     start_val = tns[indx];
45374c1414c8SBarry Smith     end_val   = tns[indx + 1];
45384c1414c8SBarry Smith     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
45394c1414c8SBarry Smith   }
45404c1414c8SBarry Smith 
45414c1414c8SBarry Smith   /* Form the inode structure for the columns of permuted matrix using inv perm*/
45424c1414c8SBarry Smith   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
45434c1414c8SBarry Smith 
45444c1414c8SBarry Smith   /* Construct permutations for columns */
45454c1414c8SBarry Smith   for (i = 0, col = 0; i < nslim_col; ++i) {
45464c1414c8SBarry Smith     indx      = cidx[i];
45474c1414c8SBarry Smith     start_val = tns[indx];
45484c1414c8SBarry Smith     end_val   = tns[indx + 1];
45494c1414c8SBarry Smith     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
45504c1414c8SBarry Smith   }
45514c1414c8SBarry Smith 
45529566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
45539566063dSJacob Faibussowitsch   PetscCall(ISSetPermutation(*rperm));
45549566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
45559566063dSJacob Faibussowitsch   PetscCall(ISSetPermutation(*cperm));
45564c1414c8SBarry Smith 
45579566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(ris, &ridx));
45589566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(cis, &cidx));
45594c1414c8SBarry Smith 
45609566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
45619566063dSJacob Faibussowitsch   PetscCall(PetscFree2(permr, permc));
45629566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&cis));
45639566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&ris));
45649566063dSJacob Faibussowitsch   PetscCall(PetscFree(tns));
45653ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
45664c1414c8SBarry Smith }
45674c1414c8SBarry Smith 
45684c1414c8SBarry Smith /*@C
456911a5261eSBarry Smith   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
45704c1414c8SBarry Smith 
45713f9fe445SBarry Smith   Not Collective
45724c1414c8SBarry Smith 
45734c1414c8SBarry Smith   Input Parameter:
457411a5261eSBarry Smith . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
45754c1414c8SBarry Smith 
4576d8d19677SJose E. Roman   Output Parameters:
45774c1414c8SBarry Smith + node_count - no of inodes present in the matrix.
45782ef1f0ffSBarry Smith . sizes      - an array of size `node_count`, with the sizes of each inode.
45794c1414c8SBarry Smith - limit      - the max size used to generate the inodes.
45804c1414c8SBarry Smith 
45814c1414c8SBarry Smith   Level: advanced
45824c1414c8SBarry Smith 
458311a5261eSBarry Smith   Note:
45844c1414c8SBarry Smith   It should be called after the matrix is assembled.
45854c1414c8SBarry Smith   The contents of the sizes[] array should not be changed.
45862ef1f0ffSBarry Smith   `NULL` may be passed for information not needed
45874c1414c8SBarry Smith 
45881cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
45894c1414c8SBarry Smith @*/
4590d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4591d71ae5a4SJacob Faibussowitsch {
45925f80ce2aSJacob Faibussowitsch   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
45934c1414c8SBarry Smith 
45944c1414c8SBarry Smith   PetscFunctionBegin;
45955f80ce2aSJacob Faibussowitsch   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
45969566063dSJacob Faibussowitsch   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
45979566063dSJacob Faibussowitsch   if (f) PetscCall((*f)(A, node_count, sizes, limit));
45983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
45994c1414c8SBarry Smith }
46004c1414c8SBarry Smith 
4601d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4602d71ae5a4SJacob Faibussowitsch {
46034c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
46044c1414c8SBarry Smith 
46054c1414c8SBarry Smith   PetscFunctionBegin;
46064c1414c8SBarry Smith   if (node_count) *node_count = a->inode.node_count;
46074c1414c8SBarry Smith   if (sizes) *sizes = a->inode.size;
46084c1414c8SBarry Smith   if (limit) *limit = a->inode.limit;
46093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
46104c1414c8SBarry Smith }
4611