xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 708a0e70cd6725d8c07183d050e500ccd6c2dca6)
14c1414c8SBarry Smith /*
24c1414c8SBarry Smith   This file provides high performance routines for the Inode format (compressed sparse row)
34c1414c8SBarry Smith   by taking advantage of rows with identical nonzero structure (I-nodes).
44c1414c8SBarry Smith */
5c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h>
6fb56d528SJed Brown #if defined(PETSC_HAVE_XMMINTRIN_H)
7fb56d528SJed Brown   #include <xmmintrin.h>
8fb56d528SJed Brown #endif
94c1414c8SBarry Smith 
10d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11d71ae5a4SJacob Faibussowitsch {
124c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
134c1414c8SBarry Smith   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
144c1414c8SBarry Smith 
154c1414c8SBarry Smith   PetscFunctionBegin;
16d0f46423SBarry Smith   n = A->cmap->n;
17d0f46423SBarry Smith   m = A->rmap->n;
184d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
194d12350bSJunchao Zhang   ns_row = a->inode.size_csr;
204c1414c8SBarry Smith 
214c1414c8SBarry Smith   min_mn = (m < n) ? m : n;
224c1414c8SBarry Smith   if (!ns) {
234d12350bSJunchao Zhang     for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
24fbccb6d4SPierre Jolivet     for (; count + 1 < n; count++, i++);
25ad540459SPierre Jolivet     if (count < n) i++;
264c1414c8SBarry Smith     *size = i;
273ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
284c1414c8SBarry Smith   }
299566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &ns_col));
304d12350bSJunchao Zhang   ns_col[0] = 0;
314c1414c8SBarry Smith 
324c1414c8SBarry Smith   /* Use the same row structure wherever feasible. */
334d12350bSJunchao Zhang   for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];
344c1414c8SBarry Smith 
354c1414c8SBarry Smith   /* if m < n; pad up the remainder with inode_limit */
364d12350bSJunchao Zhang   for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
37aaa8cc7dSPierre Jolivet   /* The last node is the odd ball. pad it up with the remaining rows; */
384c1414c8SBarry Smith   if (count < n) {
394d12350bSJunchao Zhang     ns_col[i + 1] = ns_col[i] + (n - count);
404c1414c8SBarry Smith     i++;
414c1414c8SBarry Smith   } else if (count > n) {
424c1414c8SBarry Smith     /* Adjust for the over estimation */
434d12350bSJunchao Zhang     ns_col[i] += n - count;
444c1414c8SBarry Smith   }
454c1414c8SBarry Smith   *size = i;
464c1414c8SBarry Smith   *ns   = ns_col;
473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
484c1414c8SBarry Smith }
494c1414c8SBarry Smith 
504c1414c8SBarry Smith /*
514c1414c8SBarry Smith       This builds symmetric version of nonzero structure,
524c1414c8SBarry Smith */
53d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
54d71ae5a4SJacob Faibussowitsch {
554c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
568758e1faSBarry Smith   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
574d12350bSJunchao Zhang   PetscInt       *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
588758e1faSBarry Smith   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
594c1414c8SBarry Smith 
604c1414c8SBarry Smith   PetscFunctionBegin;
614c1414c8SBarry Smith   nslim_row = a->inode.node_count;
62d0f46423SBarry Smith   m         = A->rmap->n;
63d0f46423SBarry Smith   n         = A->cmap->n;
6408401ef6SPierre Jolivet   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
654d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
664c1414c8SBarry Smith 
674c1414c8SBarry Smith   /* Use the row_inode as column_inode */
684c1414c8SBarry Smith   nslim_col = nslim_row;
694c1414c8SBarry Smith   ns_col    = ns_row;
704c1414c8SBarry Smith 
7135cb6cd3SPierre Jolivet   /* allocate space for reformatted inode structure */
729566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
734d12350bSJunchao Zhang   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);
744c1414c8SBarry Smith 
754c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
764d12350bSJunchao Zhang     nsz = ns_col[i1 + 1] - ns_col[i1];
772205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
784c1414c8SBarry Smith   }
794c1414c8SBarry Smith   /* allocate space for row pointers */
809566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
814c1414c8SBarry Smith   *iia = ia;
829566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_row + 1, &work));
834c1414c8SBarry Smith 
844c1414c8SBarry Smith   /* determine the number of columns in each row */
854c1414c8SBarry Smith   ia[0] = oshift;
864d12350bSJunchao Zhang   for (i1 = 0; i1 < nslim_row; i1++) {
874d12350bSJunchao Zhang     row  = ns_row[i1];
884c1414c8SBarry Smith     j    = aj + ai[row] + ishift;
894c1414c8SBarry Smith     jmax = aj + ai[row + 1] + ishift;
9083fed2edSSatish Balay     if (j == jmax) continue; /* empty row */
914c1414c8SBarry Smith     col = *j++ + ishift;
924c1414c8SBarry Smith     i2  = tvc[col];
936aad120cSJose E. Roman     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
944c1414c8SBarry Smith       ia[i1 + 1]++;
954c1414c8SBarry Smith       ia[i2 + 1]++;
964c1414c8SBarry Smith       i2++; /* Start col of next node */
9790d2dec7SBarry Smith       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
984c1414c8SBarry Smith       i2 = tvc[col];
994c1414c8SBarry Smith     }
1004c1414c8SBarry Smith     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
1014c1414c8SBarry Smith   }
1024c1414c8SBarry Smith 
1034c1414c8SBarry Smith   /* shift ia[i] to point to next row */
1044c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_row + 1; i1++) {
1054c1414c8SBarry Smith     row = ia[i1 - 1];
1064c1414c8SBarry Smith     ia[i1] += row;
1074c1414c8SBarry Smith     work[i1 - 1] = row - oshift;
1084c1414c8SBarry Smith   }
1094c1414c8SBarry Smith 
1104c1414c8SBarry Smith   /* allocate space for column pointers */
1114c1414c8SBarry Smith   nz = ia[nslim_row] + (!ishift);
1129566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
1134c1414c8SBarry Smith   *jja = ja;
1144c1414c8SBarry Smith 
1154c1414c8SBarry Smith   /* loop over lower triangular part putting into ja */
1164d12350bSJunchao Zhang   for (i1 = 0; i1 < nslim_row; i1++) {
1174d12350bSJunchao Zhang     row  = ns_row[i1];
1184c1414c8SBarry Smith     j    = aj + ai[row] + ishift;
1194c1414c8SBarry Smith     jmax = aj + ai[row + 1] + ishift;
12083fed2edSSatish Balay     if (j == jmax) continue; /* empty row */
1214c1414c8SBarry Smith     col = *j++ + ishift;
1224c1414c8SBarry Smith     i2  = tvc[col];
1234c1414c8SBarry Smith     while (i2 < i1 && j < jmax) {
1244c1414c8SBarry Smith       ja[work[i2]++] = i1 + oshift;
1254c1414c8SBarry Smith       ja[work[i1]++] = i2 + oshift;
1264c1414c8SBarry Smith       ++i2;
12790d2dec7SBarry Smith       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
1284c1414c8SBarry Smith       i2 = tvc[col];
1294c1414c8SBarry Smith     }
1304c1414c8SBarry Smith     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
1314c1414c8SBarry Smith   }
1329566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
1339566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
1343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1354c1414c8SBarry Smith }
1364c1414c8SBarry Smith 
1374c1414c8SBarry Smith /*
1384c1414c8SBarry Smith       This builds nonsymmetric version of nonzero structure,
1394c1414c8SBarry Smith */
140d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141d71ae5a4SJacob Faibussowitsch {
1424c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
1438758e1faSBarry Smith   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
1448758e1faSBarry Smith   PetscInt       *tns, *tvc, nsz, i1, i2;
1454d12350bSJunchao Zhang   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;
1464c1414c8SBarry Smith 
1474c1414c8SBarry Smith   PetscFunctionBegin;
1484d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
1494c1414c8SBarry Smith   nslim_row = a->inode.node_count;
150d0f46423SBarry Smith   n         = A->cmap->n;
1514c1414c8SBarry Smith 
1524c1414c8SBarry Smith   /* Create The column_inode for this matrix */
1539566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
1544c1414c8SBarry Smith 
15535cb6cd3SPierre Jolivet   /* allocate space for reformatted column_inode structure */
1569566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
1574d12350bSJunchao Zhang   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
1584c1414c8SBarry Smith 
1594c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
1604d12350bSJunchao Zhang     nsz = ns_col[i1 + 1] - ns_col[i1];
1612205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
1624c1414c8SBarry Smith   }
1634c1414c8SBarry Smith   /* allocate space for row pointers */
1649566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
1654c1414c8SBarry Smith   *iia = ia;
1669566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_row + 1, &work));
1674c1414c8SBarry Smith 
1684c1414c8SBarry Smith   /* determine the number of columns in each row */
1694c1414c8SBarry Smith   ia[0] = oshift;
1704d12350bSJunchao Zhang   for (i1 = 0; i1 < nslim_row; i1++) {
1714d12350bSJunchao Zhang     row = ns_row[i1];
1724c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
17383fed2edSSatish Balay     nz  = ai[row + 1] - ai[row];
17483fed2edSSatish Balay     if (!nz) continue; /* empty row */
1754c1414c8SBarry Smith     col = *j++ + ishift;
1764c1414c8SBarry Smith     i2  = tvc[col];
1776aad120cSJose E. Roman     while (nz-- > 0) { /* off-diagonal elements */
1784c1414c8SBarry Smith       ia[i1 + 1]++;
1794c1414c8SBarry Smith       i2++; /* Start col of next node */
180a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
1814c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
1824c1414c8SBarry Smith     }
1834c1414c8SBarry Smith   }
1844c1414c8SBarry Smith 
1854c1414c8SBarry Smith   /* shift ia[i] to point to next row */
1864c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_row + 1; i1++) {
1874c1414c8SBarry Smith     row = ia[i1 - 1];
1884c1414c8SBarry Smith     ia[i1] += row;
1894c1414c8SBarry Smith     work[i1 - 1] = row - oshift;
1904c1414c8SBarry Smith   }
1914c1414c8SBarry Smith 
1924c1414c8SBarry Smith   /* allocate space for column pointers */
1934c1414c8SBarry Smith   nz = ia[nslim_row] + (!ishift);
1949566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
1954c1414c8SBarry Smith   *jja = ja;
1964c1414c8SBarry Smith 
1974c1414c8SBarry Smith   /* loop over matrix putting into ja */
1984d12350bSJunchao Zhang   for (i1 = 0; i1 < nslim_row; i1++) {
1994d12350bSJunchao Zhang     row = ns_row[i1];
2004c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
20183fed2edSSatish Balay     nz  = ai[row + 1] - ai[row];
20283fed2edSSatish Balay     if (!nz) continue; /* empty row */
2034c1414c8SBarry Smith     col = *j++ + ishift;
2044c1414c8SBarry Smith     i2  = tvc[col];
2054c1414c8SBarry Smith     while (nz-- > 0) {
2064c1414c8SBarry Smith       ja[work[i1]++] = i2 + oshift;
2074c1414c8SBarry Smith       ++i2;
208a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
2094c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
2104c1414c8SBarry Smith     }
2114c1414c8SBarry Smith   }
2129566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
2139566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
2149566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
2153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2164c1414c8SBarry Smith }
2174c1414c8SBarry Smith 
218d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219d71ae5a4SJacob Faibussowitsch {
2204c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2214c1414c8SBarry Smith 
2224c1414c8SBarry Smith   PetscFunctionBegin;
22350ba90b4SBarry Smith   if (n) *n = a->inode.node_count;
2243ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2258f7157efSSatish Balay   if (!blockcompressed) {
2269566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
2278f7157efSSatish Balay   } else if (symmetric) {
2289566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
2294c1414c8SBarry Smith   } else {
2309566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
2314c1414c8SBarry Smith   }
2323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2334c1414c8SBarry Smith }
2344c1414c8SBarry Smith 
235d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236d71ae5a4SJacob Faibussowitsch {
2374c1414c8SBarry Smith   PetscFunctionBegin;
2383ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2398f7157efSSatish Balay 
2408f7157efSSatish Balay   if (!blockcompressed) {
2419566063dSJacob Faibussowitsch     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
2428f7157efSSatish Balay   } else {
2439566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ia));
2449566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ja));
2458f7157efSSatish Balay   }
2463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2474c1414c8SBarry Smith }
2484c1414c8SBarry Smith 
249d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250d71ae5a4SJacob Faibussowitsch {
2514c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2524c1414c8SBarry Smith   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
2534d12350bSJunchao Zhang   PetscInt   *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;
2544c1414c8SBarry Smith 
2554c1414c8SBarry Smith   PetscFunctionBegin;
2564d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2574c1414c8SBarry Smith   nslim_row = a->inode.node_count;
258d0f46423SBarry Smith   n         = A->cmap->n;
2594c1414c8SBarry Smith 
2604c1414c8SBarry Smith   /* Create The column_inode for this matrix */
2619566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
2624c1414c8SBarry Smith 
26335cb6cd3SPierre Jolivet   /* allocate space for reformatted column_inode structure */
2649566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
2654d12350bSJunchao Zhang   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
2664c1414c8SBarry Smith 
2674c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
2684d12350bSJunchao Zhang     nsz = ns_col[i1 + 1] - ns_col[i1];
2692205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
2704c1414c8SBarry Smith   }
2714c1414c8SBarry Smith   /* allocate space for column pointers */
2729566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
2734c1414c8SBarry Smith   *iia = ia;
2749566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_col + 1, &work));
2754c1414c8SBarry Smith 
2764c1414c8SBarry Smith   /* determine the number of columns in each row */
2774c1414c8SBarry Smith   ia[0] = oshift;
2784d12350bSJunchao Zhang   for (i1 = 0; i1 < nslim_row; i1++) {
2794d12350bSJunchao Zhang     row = ns_row[i1];
2804c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
2814c1414c8SBarry Smith     col = *j++ + ishift;
2824c1414c8SBarry Smith     i2  = tvc[col];
2834c1414c8SBarry Smith     nz  = ai[row + 1] - ai[row];
2846aad120cSJose E. Roman     while (nz-- > 0) { /* off-diagonal elements */
2854c1414c8SBarry Smith       /* ia[i1+1]++; */
2864c1414c8SBarry Smith       ia[i2 + 1]++;
2874c1414c8SBarry Smith       i2++;
288a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
2894c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
2904c1414c8SBarry Smith     }
2914c1414c8SBarry Smith   }
2924c1414c8SBarry Smith 
2934c1414c8SBarry Smith   /* shift ia[i] to point to next col */
2944c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_col + 1; i1++) {
2954c1414c8SBarry Smith     col = ia[i1 - 1];
2964c1414c8SBarry Smith     ia[i1] += col;
2974c1414c8SBarry Smith     work[i1 - 1] = col - oshift;
2984c1414c8SBarry Smith   }
2994c1414c8SBarry Smith 
3004c1414c8SBarry Smith   /* allocate space for column pointers */
3014c1414c8SBarry Smith   nz = ia[nslim_col] + (!ishift);
3029566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
3034c1414c8SBarry Smith   *jja = ja;
3044c1414c8SBarry Smith 
3054c1414c8SBarry Smith   /* loop over matrix putting into ja */
3064d12350bSJunchao Zhang   for (i1 = 0; i1 < nslim_row; i1++) {
3074d12350bSJunchao Zhang     row = ns_row[i1];
3084c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
3094c1414c8SBarry Smith     col = *j++ + ishift;
3104c1414c8SBarry Smith     i2  = tvc[col];
3114c1414c8SBarry Smith     nz  = ai[row + 1] - ai[row];
3124c1414c8SBarry Smith     while (nz-- > 0) {
3134c1414c8SBarry Smith       /* ja[work[i1]++] = i2 + oshift; */
3144c1414c8SBarry Smith       ja[work[i2]++] = i1 + oshift;
3154c1414c8SBarry Smith       i2++;
316a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
3174c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
3184c1414c8SBarry Smith     }
3194c1414c8SBarry Smith   }
3209566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
3219566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
3229566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
3233ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3244c1414c8SBarry Smith }
3254c1414c8SBarry Smith 
326d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327d71ae5a4SJacob Faibussowitsch {
3284c1414c8SBarry Smith   PetscFunctionBegin;
3299566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, n, NULL));
3303ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
3314c1414c8SBarry Smith 
3328f7157efSSatish Balay   if (!blockcompressed) {
3339566063dSJacob Faibussowitsch     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
3348f7157efSSatish Balay   } else if (symmetric) {
335a5b23f4aSJose E. Roman     /* Since the indices are symmetric it doesn't matter */
3369566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
3374c1414c8SBarry Smith   } else {
3389566063dSJacob Faibussowitsch     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
3394c1414c8SBarry Smith   }
3403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3414c1414c8SBarry Smith }
3424c1414c8SBarry Smith 
343d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344d71ae5a4SJacob Faibussowitsch {
3454c1414c8SBarry Smith   PetscFunctionBegin;
3463ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
3478f7157efSSatish Balay   if (!blockcompressed) {
3489566063dSJacob Faibussowitsch     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
3498f7157efSSatish Balay   } else {
3509566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ia));
3519566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ja));
3528f7157efSSatish Balay   }
3533ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3544c1414c8SBarry Smith }
3554c1414c8SBarry Smith 
356d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357d71ae5a4SJacob Faibussowitsch {
3584c1414c8SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
359d9fead3dSBarry Smith   PetscScalar       *y;
360dd6ea824SBarry Smith   const PetscScalar *x;
361*708a0e70SJunchao Zhang   PetscInt           row, node_max, nonzerorow = 0;
362*708a0e70SJunchao Zhang   PetscInt          *ns;
3634c1414c8SBarry Smith 
3644c1414c8SBarry Smith   PetscFunctionBegin;
3654d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
3664c1414c8SBarry Smith   node_max = a->inode.node_count;
3674d12350bSJunchao Zhang   ns       = a->inode.size_csr; /* Node Size array */
3689566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3699566063dSJacob Faibussowitsch   PetscCall(VecGetArray(yy, &y));
3704c1414c8SBarry Smith 
371*708a0e70SJunchao Zhang   PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow))
372*708a0e70SJunchao Zhang   for (PetscInt i = 0; i < node_max; ++i) {
373*708a0e70SJunchao Zhang     PetscInt         i1, i2, nsz, n, sz;
374*708a0e70SJunchao Zhang     const MatScalar *v1, *v2, *v3, *v4, *v5;
375*708a0e70SJunchao Zhang     PetscScalar      sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
376*708a0e70SJunchao Zhang     const PetscInt  *idx;
377*708a0e70SJunchao Zhang 
378*708a0e70SJunchao Zhang #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379*708a0e70SJunchao Zhang   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
380*708a0e70SJunchao Zhang #endif
381*708a0e70SJunchao Zhang     row = ns[i];
3824d12350bSJunchao Zhang     nsz = ns[i + 1] - ns[i];
383*708a0e70SJunchao Zhang     n   = a->i[row + 1] - a->i[row];
38498c9bda7SSatish Balay     nonzerorow += (n > 0) * nsz;
385*708a0e70SJunchao Zhang 
386*708a0e70SJunchao Zhang     idx = &a->j[a->i[row]];
387*708a0e70SJunchao Zhang     v1  = &a->a[a->i[row]];
38850d8bf02SJed Brown     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
38950d8bf02SJed Brown     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
3904c1414c8SBarry Smith     sz = n;                                                                /* No of non zeros in this row */
3914c1414c8SBarry Smith                                                                            /* Switch on the size of Node */
3924c1414c8SBarry Smith     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
3934c1414c8SBarry Smith     case 1:
39475567043SBarry Smith       sum1 = 0.;
3954c1414c8SBarry Smith 
3964c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
3974c1414c8SBarry Smith         i1 = idx[0]; /* The instructions are ordered to */
3984c1414c8SBarry Smith         i2 = idx[1]; /* make the compiler's job easy */
3994c1414c8SBarry Smith         idx += 2;
4004c1414c8SBarry Smith         tmp0 = x[i1];
4014c1414c8SBarry Smith         tmp1 = x[i2];
4029371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4039371c9d4SSatish Balay         v1 += 2;
4044c1414c8SBarry Smith       }
4054c1414c8SBarry Smith 
4064c1414c8SBarry Smith       if (n == sz - 1) { /* Take care of the last nonzero  */
4074c1414c8SBarry Smith         tmp0 = x[*idx++];
4084c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4094c1414c8SBarry Smith       }
4104c1414c8SBarry Smith       y[row++] = sum1;
4114c1414c8SBarry Smith       break;
4124c1414c8SBarry Smith     case 2:
41375567043SBarry Smith       sum1 = 0.;
41475567043SBarry Smith       sum2 = 0.;
4154c1414c8SBarry Smith       v2   = v1 + n;
4164c1414c8SBarry Smith 
4174c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4184c1414c8SBarry Smith         i1 = idx[0];
4194c1414c8SBarry Smith         i2 = idx[1];
4204c1414c8SBarry Smith         idx += 2;
4214c1414c8SBarry Smith         tmp0 = x[i1];
4224c1414c8SBarry Smith         tmp1 = x[i2];
4239371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4249371c9d4SSatish Balay         v1 += 2;
4259371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4269371c9d4SSatish Balay         v2 += 2;
4274c1414c8SBarry Smith       }
4284c1414c8SBarry Smith       if (n == sz - 1) {
4294c1414c8SBarry Smith         tmp0 = x[*idx++];
4304c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4314c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4324c1414c8SBarry Smith       }
4334c1414c8SBarry Smith       y[row++] = sum1;
4344c1414c8SBarry Smith       y[row++] = sum2;
4354c1414c8SBarry Smith       v1       = v2; /* Since the next block to be processed starts there*/
4364c1414c8SBarry Smith       idx += sz;
4374c1414c8SBarry Smith       break;
4384c1414c8SBarry Smith     case 3:
43975567043SBarry Smith       sum1 = 0.;
44075567043SBarry Smith       sum2 = 0.;
44175567043SBarry Smith       sum3 = 0.;
4424c1414c8SBarry Smith       v2   = v1 + n;
4434c1414c8SBarry Smith       v3   = v2 + n;
4444c1414c8SBarry Smith 
4454c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4464c1414c8SBarry Smith         i1 = idx[0];
4474c1414c8SBarry Smith         i2 = idx[1];
4484c1414c8SBarry Smith         idx += 2;
4494c1414c8SBarry Smith         tmp0 = x[i1];
4504c1414c8SBarry Smith         tmp1 = x[i2];
4519371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4529371c9d4SSatish Balay         v1 += 2;
4539371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4549371c9d4SSatish Balay         v2 += 2;
4559371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
4569371c9d4SSatish Balay         v3 += 2;
4574c1414c8SBarry Smith       }
4584c1414c8SBarry Smith       if (n == sz - 1) {
4594c1414c8SBarry Smith         tmp0 = x[*idx++];
4604c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4614c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4624c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
4634c1414c8SBarry Smith       }
4644c1414c8SBarry Smith       y[row++] = sum1;
4654c1414c8SBarry Smith       y[row++] = sum2;
4664c1414c8SBarry Smith       y[row++] = sum3;
4674c1414c8SBarry Smith       v1       = v3; /* Since the next block to be processed starts there*/
4684c1414c8SBarry Smith       idx += 2 * sz;
4694c1414c8SBarry Smith       break;
4704c1414c8SBarry Smith     case 4:
47175567043SBarry Smith       sum1 = 0.;
47275567043SBarry Smith       sum2 = 0.;
47375567043SBarry Smith       sum3 = 0.;
47475567043SBarry Smith       sum4 = 0.;
4754c1414c8SBarry Smith       v2   = v1 + n;
4764c1414c8SBarry Smith       v3   = v2 + n;
4774c1414c8SBarry Smith       v4   = v3 + n;
4784c1414c8SBarry Smith 
4794c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4804c1414c8SBarry Smith         i1 = idx[0];
4814c1414c8SBarry Smith         i2 = idx[1];
4824c1414c8SBarry Smith         idx += 2;
4834c1414c8SBarry Smith         tmp0 = x[i1];
4844c1414c8SBarry Smith         tmp1 = x[i2];
4859371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4869371c9d4SSatish Balay         v1 += 2;
4879371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4889371c9d4SSatish Balay         v2 += 2;
4899371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
4909371c9d4SSatish Balay         v3 += 2;
4919371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
4929371c9d4SSatish Balay         v4 += 2;
4934c1414c8SBarry Smith       }
4944c1414c8SBarry Smith       if (n == sz - 1) {
4954c1414c8SBarry Smith         tmp0 = x[*idx++];
4964c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4974c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4984c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
4994c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
5004c1414c8SBarry Smith       }
5014c1414c8SBarry Smith       y[row++] = sum1;
5024c1414c8SBarry Smith       y[row++] = sum2;
5034c1414c8SBarry Smith       y[row++] = sum3;
5044c1414c8SBarry Smith       y[row++] = sum4;
5054c1414c8SBarry Smith       v1       = v4; /* Since the next block to be processed starts there*/
5064c1414c8SBarry Smith       idx += 3 * sz;
5074c1414c8SBarry Smith       break;
5084c1414c8SBarry Smith     case 5:
50975567043SBarry Smith       sum1 = 0.;
51075567043SBarry Smith       sum2 = 0.;
51175567043SBarry Smith       sum3 = 0.;
51275567043SBarry Smith       sum4 = 0.;
51375567043SBarry Smith       sum5 = 0.;
5144c1414c8SBarry Smith       v2   = v1 + n;
5154c1414c8SBarry Smith       v3   = v2 + n;
5164c1414c8SBarry Smith       v4   = v3 + n;
5174c1414c8SBarry Smith       v5   = v4 + n;
5184c1414c8SBarry Smith 
5194c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
5204c1414c8SBarry Smith         i1 = idx[0];
5214c1414c8SBarry Smith         i2 = idx[1];
5224c1414c8SBarry Smith         idx += 2;
5234c1414c8SBarry Smith         tmp0 = x[i1];
5244c1414c8SBarry Smith         tmp1 = x[i2];
5259371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
5269371c9d4SSatish Balay         v1 += 2;
5279371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
5289371c9d4SSatish Balay         v2 += 2;
5299371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
5309371c9d4SSatish Balay         v3 += 2;
5319371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
5329371c9d4SSatish Balay         v4 += 2;
5339371c9d4SSatish Balay         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
5349371c9d4SSatish Balay         v5 += 2;
5354c1414c8SBarry Smith       }
5364c1414c8SBarry Smith       if (n == sz - 1) {
5374c1414c8SBarry Smith         tmp0 = x[*idx++];
5384c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
5394c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
5404c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
5414c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
5424c1414c8SBarry Smith         sum5 += *v5++ * tmp0;
5434c1414c8SBarry Smith       }
5444c1414c8SBarry Smith       y[row++] = sum1;
5454c1414c8SBarry Smith       y[row++] = sum2;
5464c1414c8SBarry Smith       y[row++] = sum3;
5474c1414c8SBarry Smith       y[row++] = sum4;
5484c1414c8SBarry Smith       y[row++] = sum5;
5494c1414c8SBarry Smith       v1       = v5; /* Since the next block to be processed starts there */
5504c1414c8SBarry Smith       idx += 4 * sz;
5514c1414c8SBarry Smith       break;
552d71ae5a4SJacob Faibussowitsch     default:
553*708a0e70SJunchao Zhang       SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
5544c1414c8SBarry Smith     }
5554c1414c8SBarry Smith   }
5569566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5579566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(yy, &y));
5589566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
5593ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5604c1414c8SBarry Smith }
5612ef1f0ffSBarry Smith 
5624108e4d5SBarry Smith /* Almost same code as the MatMult_SeqAIJ_Inode() */
563d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
564d71ae5a4SJacob Faibussowitsch {
5654c1414c8SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
5664c1414c8SBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
5678758e1faSBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5;
5688758e1faSBarry Smith   const PetscScalar *x;
5698758e1faSBarry Smith   PetscScalar       *y, *z, *zt;
5708758e1faSBarry Smith   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
5718758e1faSBarry Smith   const PetscInt    *idx, *ns, *ii;
5724c1414c8SBarry Smith 
5734c1414c8SBarry Smith   PetscFunctionBegin;
5744d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
5754c1414c8SBarry Smith   node_max = a->inode.node_count;
5764d12350bSJunchao Zhang   ns       = a->inode.size_csr; /* Node Size array */
5772205254eSKarl Rupp 
5789566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5799566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
5804c1414c8SBarry Smith   zt = z;
5814c1414c8SBarry Smith 
5824c1414c8SBarry Smith   idx = a->j;
5834c1414c8SBarry Smith   v1  = a->a;
5844c1414c8SBarry Smith   ii  = a->i;
5854c1414c8SBarry Smith 
5864d12350bSJunchao Zhang   for (i = 0; i < node_max; ++i) {
5874d12350bSJunchao Zhang     row = ns[i];
5884d12350bSJunchao Zhang     nsz = ns[i + 1] - ns[i];
5894c1414c8SBarry Smith     n   = ii[1] - ii[0];
5904c1414c8SBarry Smith     ii += nsz;
5914c1414c8SBarry Smith     sz = n;        /* No of non zeros in this row */
5924c1414c8SBarry Smith                    /* Switch on the size of Node */
5934c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
5944c1414c8SBarry Smith     case 1:
5954c1414c8SBarry Smith       sum1 = *zt++;
5964c1414c8SBarry Smith 
5974c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
5984c1414c8SBarry Smith         i1 = idx[0]; /* The instructions are ordered to */
5994c1414c8SBarry Smith         i2 = idx[1]; /* make the compiler's job easy */
6004c1414c8SBarry Smith         idx += 2;
6014c1414c8SBarry Smith         tmp0 = x[i1];
6024c1414c8SBarry Smith         tmp1 = x[i2];
6039371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6049371c9d4SSatish Balay         v1 += 2;
6054c1414c8SBarry Smith       }
6064c1414c8SBarry Smith 
6074c1414c8SBarry Smith       if (n == sz - 1) { /* Take care of the last nonzero  */
6084c1414c8SBarry Smith         tmp0 = x[*idx++];
6094c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6104c1414c8SBarry Smith       }
6114c1414c8SBarry Smith       y[row++] = sum1;
6124c1414c8SBarry Smith       break;
6134c1414c8SBarry Smith     case 2:
6144c1414c8SBarry Smith       sum1 = *zt++;
6154c1414c8SBarry Smith       sum2 = *zt++;
6164c1414c8SBarry Smith       v2   = v1 + n;
6174c1414c8SBarry Smith 
6184c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6194c1414c8SBarry Smith         i1 = idx[0];
6204c1414c8SBarry Smith         i2 = idx[1];
6214c1414c8SBarry Smith         idx += 2;
6224c1414c8SBarry Smith         tmp0 = x[i1];
6234c1414c8SBarry Smith         tmp1 = x[i2];
6249371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6259371c9d4SSatish Balay         v1 += 2;
6269371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6279371c9d4SSatish Balay         v2 += 2;
6284c1414c8SBarry Smith       }
6294c1414c8SBarry Smith       if (n == sz - 1) {
6304c1414c8SBarry Smith         tmp0 = x[*idx++];
6314c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6324c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6334c1414c8SBarry Smith       }
6344c1414c8SBarry Smith       y[row++] = sum1;
6354c1414c8SBarry Smith       y[row++] = sum2;
6364c1414c8SBarry Smith       v1       = v2; /* Since the next block to be processed starts there*/
6374c1414c8SBarry Smith       idx += sz;
6384c1414c8SBarry Smith       break;
6394c1414c8SBarry Smith     case 3:
6404c1414c8SBarry Smith       sum1 = *zt++;
6414c1414c8SBarry Smith       sum2 = *zt++;
6424c1414c8SBarry Smith       sum3 = *zt++;
6434c1414c8SBarry Smith       v2   = v1 + n;
6444c1414c8SBarry Smith       v3   = v2 + n;
6454c1414c8SBarry Smith 
6464c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6474c1414c8SBarry Smith         i1 = idx[0];
6484c1414c8SBarry Smith         i2 = idx[1];
6494c1414c8SBarry Smith         idx += 2;
6504c1414c8SBarry Smith         tmp0 = x[i1];
6514c1414c8SBarry Smith         tmp1 = x[i2];
6529371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6539371c9d4SSatish Balay         v1 += 2;
6549371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6559371c9d4SSatish Balay         v2 += 2;
6569371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
6579371c9d4SSatish Balay         v3 += 2;
6584c1414c8SBarry Smith       }
6594c1414c8SBarry Smith       if (n == sz - 1) {
6604c1414c8SBarry Smith         tmp0 = x[*idx++];
6614c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6624c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6634c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
6644c1414c8SBarry Smith       }
6654c1414c8SBarry Smith       y[row++] = sum1;
6664c1414c8SBarry Smith       y[row++] = sum2;
6674c1414c8SBarry Smith       y[row++] = sum3;
6684c1414c8SBarry Smith       v1       = v3; /* Since the next block to be processed starts there*/
6694c1414c8SBarry Smith       idx += 2 * sz;
6704c1414c8SBarry Smith       break;
6714c1414c8SBarry Smith     case 4:
6724c1414c8SBarry Smith       sum1 = *zt++;
6734c1414c8SBarry Smith       sum2 = *zt++;
6744c1414c8SBarry Smith       sum3 = *zt++;
6754c1414c8SBarry Smith       sum4 = *zt++;
6764c1414c8SBarry Smith       v2   = v1 + n;
6774c1414c8SBarry Smith       v3   = v2 + n;
6784c1414c8SBarry Smith       v4   = v3 + n;
6794c1414c8SBarry Smith 
6804c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6814c1414c8SBarry Smith         i1 = idx[0];
6824c1414c8SBarry Smith         i2 = idx[1];
6834c1414c8SBarry Smith         idx += 2;
6844c1414c8SBarry Smith         tmp0 = x[i1];
6854c1414c8SBarry Smith         tmp1 = x[i2];
6869371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6879371c9d4SSatish Balay         v1 += 2;
6889371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6899371c9d4SSatish Balay         v2 += 2;
6909371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
6919371c9d4SSatish Balay         v3 += 2;
6929371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
6939371c9d4SSatish Balay         v4 += 2;
6944c1414c8SBarry Smith       }
6954c1414c8SBarry Smith       if (n == sz - 1) {
6964c1414c8SBarry Smith         tmp0 = x[*idx++];
6974c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6984c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6994c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
7004c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
7014c1414c8SBarry Smith       }
7024c1414c8SBarry Smith       y[row++] = sum1;
7034c1414c8SBarry Smith       y[row++] = sum2;
7044c1414c8SBarry Smith       y[row++] = sum3;
7054c1414c8SBarry Smith       y[row++] = sum4;
7064c1414c8SBarry Smith       v1       = v4; /* Since the next block to be processed starts there*/
7074c1414c8SBarry Smith       idx += 3 * sz;
7084c1414c8SBarry Smith       break;
7094c1414c8SBarry Smith     case 5:
7104c1414c8SBarry Smith       sum1 = *zt++;
7114c1414c8SBarry Smith       sum2 = *zt++;
7124c1414c8SBarry Smith       sum3 = *zt++;
7134c1414c8SBarry Smith       sum4 = *zt++;
7144c1414c8SBarry Smith       sum5 = *zt++;
7154c1414c8SBarry Smith       v2   = v1 + n;
7164c1414c8SBarry Smith       v3   = v2 + n;
7174c1414c8SBarry Smith       v4   = v3 + n;
7184c1414c8SBarry Smith       v5   = v4 + n;
7194c1414c8SBarry Smith 
7204c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
7214c1414c8SBarry Smith         i1 = idx[0];
7224c1414c8SBarry Smith         i2 = idx[1];
7234c1414c8SBarry Smith         idx += 2;
7244c1414c8SBarry Smith         tmp0 = x[i1];
7254c1414c8SBarry Smith         tmp1 = x[i2];
7269371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
7279371c9d4SSatish Balay         v1 += 2;
7289371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
7299371c9d4SSatish Balay         v2 += 2;
7309371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
7319371c9d4SSatish Balay         v3 += 2;
7329371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
7339371c9d4SSatish Balay         v4 += 2;
7349371c9d4SSatish Balay         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
7359371c9d4SSatish Balay         v5 += 2;
7364c1414c8SBarry Smith       }
7374c1414c8SBarry Smith       if (n == sz - 1) {
7384c1414c8SBarry Smith         tmp0 = x[*idx++];
7394c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
7404c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
7414c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
7424c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
7434c1414c8SBarry Smith         sum5 += *v5++ * tmp0;
7444c1414c8SBarry Smith       }
7454c1414c8SBarry Smith       y[row++] = sum1;
7464c1414c8SBarry Smith       y[row++] = sum2;
7474c1414c8SBarry Smith       y[row++] = sum3;
7484c1414c8SBarry Smith       y[row++] = sum4;
7494c1414c8SBarry Smith       y[row++] = sum5;
7504c1414c8SBarry Smith       v1       = v5; /* Since the next block to be processed starts there */
7514c1414c8SBarry Smith       idx += 4 * sz;
7524c1414c8SBarry Smith       break;
753d71ae5a4SJacob Faibussowitsch     default:
754d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
7554c1414c8SBarry Smith     }
7564c1414c8SBarry Smith   }
7579566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7589566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
7599566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
7603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
7614c1414c8SBarry Smith }
7624c1414c8SBarry Smith 
763ff6a9541SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
764d71ae5a4SJacob Faibussowitsch {
7654c1414c8SBarry Smith   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
7664c1414c8SBarry Smith   IS                 iscol = a->col, isrow = a->row;
7675d0c19d7SBarry Smith   const PetscInt    *r, *c, *rout, *cout;
7688758e1faSBarry Smith   PetscInt           i, j, n = A->rmap->n, nz;
7698758e1faSBarry Smith   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
7708758e1faSBarry Smith   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
771d9fead3dSBarry Smith   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
772d9fead3dSBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5;
773dd6ea824SBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
774dd6ea824SBarry Smith   const PetscScalar *b;
7754c1414c8SBarry Smith 
7764c1414c8SBarry Smith   PetscFunctionBegin;
7774d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
7784c1414c8SBarry Smith   node_max = a->inode.node_count;
7794d12350bSJunchao Zhang   ns       = a->inode.size_csr; /* Node Size array */
7804c1414c8SBarry Smith 
7819566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
7829566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(xx, &x));
7834c1414c8SBarry Smith   tmp = a->solve_work;
7844c1414c8SBarry Smith 
7859371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
7869371c9d4SSatish Balay   r = rout;
7879371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
7889371c9d4SSatish Balay   c = cout + (n - 1);
7894c1414c8SBarry Smith 
7904c1414c8SBarry Smith   /* forward solve the lower triangular */
7914c1414c8SBarry Smith   tmps = tmp;
7924c1414c8SBarry Smith   aa   = a_a;
7934c1414c8SBarry Smith   aj   = a_j;
7944c1414c8SBarry Smith   ad   = a->diag;
7954c1414c8SBarry Smith 
7964c1414c8SBarry Smith   for (i = 0, row = 0; i < node_max; ++i) {
7974d12350bSJunchao Zhang     row = ns[i];
7984d12350bSJunchao Zhang     nsz = ns[i + 1] - ns[i];
7994c1414c8SBarry Smith     aii = ai[row];
8004c1414c8SBarry Smith     v1  = aa + aii;
8014c1414c8SBarry Smith     vi  = aj + aii;
8024c1414c8SBarry Smith     nz  = ad[row] - aii;
80326549573SJed Brown     if (i < node_max - 1) {
80426549573SJed Brown       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
80591c35059SPierre Jolivet       * but our indexing to determine its size could. */
80650d8bf02SJed Brown       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
80726549573SJed Brown       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
8084d12350bSJunchao Zhang       PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
80926549573SJed Brown       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
81026549573SJed Brown     }
8114c1414c8SBarry Smith 
8124c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
8134c1414c8SBarry Smith     case 1:
8144c1414c8SBarry Smith       sum1 = b[*r++];
8154c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8164c1414c8SBarry Smith         i0 = vi[0];
8174c1414c8SBarry Smith         i1 = vi[1];
8184c1414c8SBarry Smith         vi += 2;
8194c1414c8SBarry Smith         tmp0 = tmps[i0];
8204c1414c8SBarry Smith         tmp1 = tmps[i1];
8219371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8229371c9d4SSatish Balay         v1 += 2;
8234c1414c8SBarry Smith       }
8244c1414c8SBarry Smith       if (j == nz - 1) {
8254c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8264c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8274c1414c8SBarry Smith       }
8284c1414c8SBarry Smith       tmp[row++] = sum1;
8294c1414c8SBarry Smith       break;
8304c1414c8SBarry Smith     case 2:
8314c1414c8SBarry Smith       sum1 = b[*r++];
8324c1414c8SBarry Smith       sum2 = b[*r++];
8334c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8344c1414c8SBarry Smith 
8354c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8364c1414c8SBarry Smith         i0 = vi[0];
8374c1414c8SBarry Smith         i1 = vi[1];
8384c1414c8SBarry Smith         vi += 2;
8394c1414c8SBarry Smith         tmp0 = tmps[i0];
8404c1414c8SBarry Smith         tmp1 = tmps[i1];
8419371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8429371c9d4SSatish Balay         v1 += 2;
8439371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8449371c9d4SSatish Balay         v2 += 2;
8454c1414c8SBarry Smith       }
8464c1414c8SBarry Smith       if (j == nz - 1) {
8474c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8484c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8494c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
8504c1414c8SBarry Smith       }
8514c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
8524c1414c8SBarry Smith       tmp[row++] = sum1;
8534c1414c8SBarry Smith       tmp[row++] = sum2;
8544c1414c8SBarry Smith       break;
8554c1414c8SBarry Smith     case 3:
8564c1414c8SBarry Smith       sum1 = b[*r++];
8574c1414c8SBarry Smith       sum2 = b[*r++];
8584c1414c8SBarry Smith       sum3 = b[*r++];
8594c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8604c1414c8SBarry Smith       v3   = aa + ai[row + 2];
8614c1414c8SBarry Smith 
8624c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8634c1414c8SBarry Smith         i0 = vi[0];
8644c1414c8SBarry Smith         i1 = vi[1];
8654c1414c8SBarry Smith         vi += 2;
8664c1414c8SBarry Smith         tmp0 = tmps[i0];
8674c1414c8SBarry Smith         tmp1 = tmps[i1];
8689371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8699371c9d4SSatish Balay         v1 += 2;
8709371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8719371c9d4SSatish Balay         v2 += 2;
8729371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
8739371c9d4SSatish Balay         v3 += 2;
8744c1414c8SBarry Smith       }
8754c1414c8SBarry Smith       if (j == nz - 1) {
8764c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8774c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8784c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
8794c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
8804c1414c8SBarry Smith       }
8814c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
8824c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
8834c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
8842205254eSKarl Rupp 
8854c1414c8SBarry Smith       tmp[row++] = sum1;
8864c1414c8SBarry Smith       tmp[row++] = sum2;
8874c1414c8SBarry Smith       tmp[row++] = sum3;
8884c1414c8SBarry Smith       break;
8894c1414c8SBarry Smith 
8904c1414c8SBarry Smith     case 4:
8914c1414c8SBarry Smith       sum1 = b[*r++];
8924c1414c8SBarry Smith       sum2 = b[*r++];
8934c1414c8SBarry Smith       sum3 = b[*r++];
8944c1414c8SBarry Smith       sum4 = b[*r++];
8954c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8964c1414c8SBarry Smith       v3   = aa + ai[row + 2];
8974c1414c8SBarry Smith       v4   = aa + ai[row + 3];
8984c1414c8SBarry Smith 
8994c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
9004c1414c8SBarry Smith         i0 = vi[0];
9014c1414c8SBarry Smith         i1 = vi[1];
9024c1414c8SBarry Smith         vi += 2;
9034c1414c8SBarry Smith         tmp0 = tmps[i0];
9044c1414c8SBarry Smith         tmp1 = tmps[i1];
9059371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
9069371c9d4SSatish Balay         v1 += 2;
9079371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
9089371c9d4SSatish Balay         v2 += 2;
9099371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
9109371c9d4SSatish Balay         v3 += 2;
9119371c9d4SSatish Balay         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
9129371c9d4SSatish Balay         v4 += 2;
9134c1414c8SBarry Smith       }
9144c1414c8SBarry Smith       if (j == nz - 1) {
9154c1414c8SBarry Smith         tmp0 = tmps[*vi++];
9164c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
9174c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
9184c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
9194c1414c8SBarry Smith         sum4 -= *v4++ * tmp0;
9204c1414c8SBarry Smith       }
9214c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
9224c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
9234c1414c8SBarry Smith       sum4 -= *v4++ * sum1;
9244c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
9254c1414c8SBarry Smith       sum4 -= *v4++ * sum2;
9264c1414c8SBarry Smith       sum4 -= *v4++ * sum3;
9274c1414c8SBarry Smith 
9284c1414c8SBarry Smith       tmp[row++] = sum1;
9294c1414c8SBarry Smith       tmp[row++] = sum2;
9304c1414c8SBarry Smith       tmp[row++] = sum3;
9314c1414c8SBarry Smith       tmp[row++] = sum4;
9324c1414c8SBarry Smith       break;
9334c1414c8SBarry Smith     case 5:
9344c1414c8SBarry Smith       sum1 = b[*r++];
9354c1414c8SBarry Smith       sum2 = b[*r++];
9364c1414c8SBarry Smith       sum3 = b[*r++];
9374c1414c8SBarry Smith       sum4 = b[*r++];
9384c1414c8SBarry Smith       sum5 = b[*r++];
9394c1414c8SBarry Smith       v2   = aa + ai[row + 1];
9404c1414c8SBarry Smith       v3   = aa + ai[row + 2];
9414c1414c8SBarry Smith       v4   = aa + ai[row + 3];
9424c1414c8SBarry Smith       v5   = aa + ai[row + 4];
9434c1414c8SBarry Smith 
9444c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
9454c1414c8SBarry Smith         i0 = vi[0];
9464c1414c8SBarry Smith         i1 = vi[1];
9474c1414c8SBarry Smith         vi += 2;
9484c1414c8SBarry Smith         tmp0 = tmps[i0];
9494c1414c8SBarry Smith         tmp1 = tmps[i1];
9509371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
9519371c9d4SSatish Balay         v1 += 2;
9529371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
9539371c9d4SSatish Balay         v2 += 2;
9549371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
9559371c9d4SSatish Balay         v3 += 2;
9569371c9d4SSatish Balay         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
9579371c9d4SSatish Balay         v4 += 2;
9589371c9d4SSatish Balay         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
9599371c9d4SSatish Balay         v5 += 2;
9604c1414c8SBarry Smith       }
9614c1414c8SBarry Smith       if (j == nz - 1) {
9624c1414c8SBarry Smith         tmp0 = tmps[*vi++];
9634c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
9644c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
9654c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
9664c1414c8SBarry Smith         sum4 -= *v4++ * tmp0;
9674c1414c8SBarry Smith         sum5 -= *v5++ * tmp0;
9684c1414c8SBarry Smith       }
9694c1414c8SBarry Smith 
9704c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
9714c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
9724c1414c8SBarry Smith       sum4 -= *v4++ * sum1;
9734c1414c8SBarry Smith       sum5 -= *v5++ * sum1;
9744c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
9754c1414c8SBarry Smith       sum4 -= *v4++ * sum2;
9764c1414c8SBarry Smith       sum5 -= *v5++ * sum2;
9774c1414c8SBarry Smith       sum4 -= *v4++ * sum3;
9784c1414c8SBarry Smith       sum5 -= *v5++ * sum3;
9794c1414c8SBarry Smith       sum5 -= *v5++ * sum4;
9804c1414c8SBarry Smith 
9814c1414c8SBarry Smith       tmp[row++] = sum1;
9824c1414c8SBarry Smith       tmp[row++] = sum2;
9834c1414c8SBarry Smith       tmp[row++] = sum3;
9844c1414c8SBarry Smith       tmp[row++] = sum4;
9854c1414c8SBarry Smith       tmp[row++] = sum5;
9864c1414c8SBarry Smith       break;
987d71ae5a4SJacob Faibussowitsch     default:
988d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
9894c1414c8SBarry Smith     }
9904c1414c8SBarry Smith   }
9914c1414c8SBarry Smith   /* backward solve the upper triangular */
9924d12350bSJunchao Zhang   for (i = node_max - 1; i >= 0; i--) {
9934d12350bSJunchao Zhang     row = ns[i + 1];
9944d12350bSJunchao Zhang     nsz = ns[i + 1] - ns[i];
9954c1414c8SBarry Smith     aii = ai[row + 1] - 1;
9964c1414c8SBarry Smith     v1  = aa + aii;
9974c1414c8SBarry Smith     vi  = aj + aii;
9984c1414c8SBarry Smith     nz  = aii - ad[row];
9994c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
10004c1414c8SBarry Smith     case 1:
10014c1414c8SBarry Smith       sum1 = tmp[row];
10024c1414c8SBarry Smith 
10034c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10044c1414c8SBarry Smith         vi -= 2;
10054c1414c8SBarry Smith         i0   = vi[2];
10064c1414c8SBarry Smith         i1   = vi[1];
10074c1414c8SBarry Smith         tmp0 = tmps[i0];
10084c1414c8SBarry Smith         tmp1 = tmps[i1];
10094c1414c8SBarry Smith         v1 -= 2;
10104c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10114c1414c8SBarry Smith       }
10124c1414c8SBarry Smith       if (j == 1) {
10134c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10144c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10154c1414c8SBarry Smith       }
10169371c9d4SSatish Balay       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10179371c9d4SSatish Balay       row--;
10184c1414c8SBarry Smith       break;
10194c1414c8SBarry Smith     case 2:
10204c1414c8SBarry Smith       sum1 = tmp[row];
10214c1414c8SBarry Smith       sum2 = tmp[row - 1];
10224c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10234c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10244c1414c8SBarry Smith         vi -= 2;
10254c1414c8SBarry Smith         i0   = vi[2];
10264c1414c8SBarry Smith         i1   = vi[1];
10274c1414c8SBarry Smith         tmp0 = tmps[i0];
10284c1414c8SBarry Smith         tmp1 = tmps[i1];
10294c1414c8SBarry Smith         v1 -= 2;
10304c1414c8SBarry Smith         v2 -= 2;
10314c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10324c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10334c1414c8SBarry Smith       }
10344c1414c8SBarry Smith       if (j == 1) {
10354c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10364c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10374c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
10384c1414c8SBarry Smith       }
10394c1414c8SBarry Smith 
10409371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10419371c9d4SSatish Balay       row--;
10424c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
10439371c9d4SSatish Balay       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
10449371c9d4SSatish Balay       row--;
10454c1414c8SBarry Smith       break;
10464c1414c8SBarry Smith     case 3:
10474c1414c8SBarry Smith       sum1 = tmp[row];
10484c1414c8SBarry Smith       sum2 = tmp[row - 1];
10494c1414c8SBarry Smith       sum3 = tmp[row - 2];
10504c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10514c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
10524c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10534c1414c8SBarry Smith         vi -= 2;
10544c1414c8SBarry Smith         i0   = vi[2];
10554c1414c8SBarry Smith         i1   = vi[1];
10564c1414c8SBarry Smith         tmp0 = tmps[i0];
10574c1414c8SBarry Smith         tmp1 = tmps[i1];
10584c1414c8SBarry Smith         v1 -= 2;
10594c1414c8SBarry Smith         v2 -= 2;
10604c1414c8SBarry Smith         v3 -= 2;
10614c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10624c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10634c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
10644c1414c8SBarry Smith       }
10654c1414c8SBarry Smith       if (j == 1) {
10664c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10674c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10684c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
10694c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
10704c1414c8SBarry Smith       }
10719371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10729371c9d4SSatish Balay       row--;
10734c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
10744c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
10759371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
10769371c9d4SSatish Balay       row--;
10774c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
10789371c9d4SSatish Balay       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
10799371c9d4SSatish Balay       row--;
10804c1414c8SBarry Smith 
10814c1414c8SBarry Smith       break;
10824c1414c8SBarry Smith     case 4:
10834c1414c8SBarry Smith       sum1 = tmp[row];
10844c1414c8SBarry Smith       sum2 = tmp[row - 1];
10854c1414c8SBarry Smith       sum3 = tmp[row - 2];
10864c1414c8SBarry Smith       sum4 = tmp[row - 3];
10874c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10884c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
10894c1414c8SBarry Smith       v4   = aa + ai[row - 2] - 1;
10904c1414c8SBarry Smith 
10914c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10924c1414c8SBarry Smith         vi -= 2;
10934c1414c8SBarry Smith         i0   = vi[2];
10944c1414c8SBarry Smith         i1   = vi[1];
10954c1414c8SBarry Smith         tmp0 = tmps[i0];
10964c1414c8SBarry Smith         tmp1 = tmps[i1];
10974c1414c8SBarry Smith         v1 -= 2;
10984c1414c8SBarry Smith         v2 -= 2;
10994c1414c8SBarry Smith         v3 -= 2;
11004c1414c8SBarry Smith         v4 -= 2;
11014c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
11024c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
11034c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
11044c1414c8SBarry Smith         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
11054c1414c8SBarry Smith       }
11064c1414c8SBarry Smith       if (j == 1) {
11074c1414c8SBarry Smith         tmp0 = tmps[*vi--];
11084c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
11094c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
11104c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
11114c1414c8SBarry Smith         sum4 -= *v4-- * tmp0;
11124c1414c8SBarry Smith       }
11134c1414c8SBarry Smith 
11149371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
11159371c9d4SSatish Balay       row--;
11164c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
11174c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11184c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11199371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
11209371c9d4SSatish Balay       row--;
11214c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11224c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11239371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
11249371c9d4SSatish Balay       row--;
11254c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11269371c9d4SSatish Balay       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
11279371c9d4SSatish Balay       row--;
11284c1414c8SBarry Smith       break;
11294c1414c8SBarry Smith     case 5:
11304c1414c8SBarry Smith       sum1 = tmp[row];
11314c1414c8SBarry Smith       sum2 = tmp[row - 1];
11324c1414c8SBarry Smith       sum3 = tmp[row - 2];
11334c1414c8SBarry Smith       sum4 = tmp[row - 3];
11344c1414c8SBarry Smith       sum5 = tmp[row - 4];
11354c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
11364c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
11374c1414c8SBarry Smith       v4   = aa + ai[row - 2] - 1;
11384c1414c8SBarry Smith       v5   = aa + ai[row - 3] - 1;
11394c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
11404c1414c8SBarry Smith         vi -= 2;
11414c1414c8SBarry Smith         i0   = vi[2];
11424c1414c8SBarry Smith         i1   = vi[1];
11434c1414c8SBarry Smith         tmp0 = tmps[i0];
11444c1414c8SBarry Smith         tmp1 = tmps[i1];
11454c1414c8SBarry Smith         v1 -= 2;
11464c1414c8SBarry Smith         v2 -= 2;
11474c1414c8SBarry Smith         v3 -= 2;
11484c1414c8SBarry Smith         v4 -= 2;
11494c1414c8SBarry Smith         v5 -= 2;
11504c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
11514c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
11524c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
11534c1414c8SBarry Smith         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
11544c1414c8SBarry Smith         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
11554c1414c8SBarry Smith       }
11564c1414c8SBarry Smith       if (j == 1) {
11574c1414c8SBarry Smith         tmp0 = tmps[*vi--];
11584c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
11594c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
11604c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
11614c1414c8SBarry Smith         sum4 -= *v4-- * tmp0;
11624c1414c8SBarry Smith         sum5 -= *v5-- * tmp0;
11634c1414c8SBarry Smith       }
11644c1414c8SBarry Smith 
11659371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
11669371c9d4SSatish Balay       row--;
11674c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
11684c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11694c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11704c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11719371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
11729371c9d4SSatish Balay       row--;
11734c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11744c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11754c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11769371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
11779371c9d4SSatish Balay       row--;
11784c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11794c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11809371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
11819371c9d4SSatish Balay       row--;
11824c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11839371c9d4SSatish Balay       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
11849371c9d4SSatish Balay       row--;
11854c1414c8SBarry Smith       break;
1186d71ae5a4SJacob Faibussowitsch     default:
1187d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
11884c1414c8SBarry Smith     }
11894c1414c8SBarry Smith   }
11909566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
11919566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
11929566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
11939566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(xx, &x));
11949566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
11953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
11964c1414c8SBarry Smith }
11974c1414c8SBarry Smith 
1198d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1199d71ae5a4SJacob Faibussowitsch {
120028f1b45aSHong Zhang   Mat              C = B;
120128f1b45aSHong Zhang   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
120228f1b45aSHong Zhang   IS               isrow = b->row, isicol = b->icol;
120328f1b45aSHong Zhang   const PetscInt  *r, *ic, *ics;
120428f1b45aSHong Zhang   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
120528f1b45aSHong Zhang   PetscInt         i, j, k, nz, nzL, row, *pj;
120628f1b45aSHong Zhang   const PetscInt  *ajtmp, *bjtmp;
12079877982aSShri Abhyankar   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
12089877982aSShri Abhyankar   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
120928f1b45aSHong Zhang   FactorShiftCtx   sctx;
12104f81c4b7SBarry Smith   const PetscInt  *ddiag;
121128f1b45aSHong Zhang   PetscReal        rs;
121228f1b45aSHong Zhang   MatScalar        d;
12134f81c4b7SBarry Smith   PetscInt         inod, nodesz, node_max, col;
12144f81c4b7SBarry Smith   const PetscInt  *ns;
121507b50cabSHong Zhang   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
12160e95ead3SHong Zhang 
121728f1b45aSHong Zhang   PetscFunctionBegin;
121828f1b45aSHong Zhang   /* MatPivotSetUp(): initialize shift context sctx */
12199566063dSJacob Faibussowitsch   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
122028f1b45aSHong Zhang 
1221f4db908eSBarry Smith   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
122228f1b45aSHong Zhang     ddiag          = a->diag;
122328f1b45aSHong Zhang     sctx.shift_top = info->zeropivot;
122428f1b45aSHong Zhang     for (i = 0; i < n; i++) {
122528f1b45aSHong Zhang       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
122628f1b45aSHong Zhang       d  = (aa)[ddiag[i]];
122728f1b45aSHong Zhang       rs = -PetscAbsScalar(d) - PetscRealPart(d);
122828f1b45aSHong Zhang       v  = aa + ai[i];
122928f1b45aSHong Zhang       nz = ai[i + 1] - ai[i];
12302205254eSKarl Rupp       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
123128f1b45aSHong Zhang       if (rs > sctx.shift_top) sctx.shift_top = rs;
123228f1b45aSHong Zhang     }
123328f1b45aSHong Zhang     sctx.shift_top *= 1.1;
123428f1b45aSHong Zhang     sctx.nshift_max = 5;
123528f1b45aSHong Zhang     sctx.shift_lo   = 0.;
123628f1b45aSHong Zhang     sctx.shift_hi   = 1.;
123728f1b45aSHong Zhang   }
123828f1b45aSHong Zhang 
12399566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
12409566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
124168785679SHong Zhang 
12429566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
124328f1b45aSHong Zhang   ics = ic;
124428f1b45aSHong Zhang 
124528f1b45aSHong Zhang   node_max = a->inode.node_count;
12464d12350bSJunchao Zhang   ns       = a->inode.size_csr;
124728b400f6SJacob Faibussowitsch   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
124828f1b45aSHong Zhang 
12499877982aSShri Abhyankar   /* If max inode size > 4, split it into two inodes.*/
125068785679SHong Zhang   /* also map the inode sizes according to the ordering */
12519566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
125268785679SHong Zhang   for (i = 0, j = 0; i < node_max; ++i, ++j) {
12534d12350bSJunchao Zhang     nodesz = ns[i + 1] - ns[i];
12544d12350bSJunchao Zhang     if (nodesz > 4) {
1255048b5e81SShri Abhyankar       tmp_vec1[j] = 4;
125668785679SHong Zhang       ++j;
12574d12350bSJunchao Zhang       tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
125868785679SHong Zhang     } else {
12594d12350bSJunchao Zhang       tmp_vec1[j] = nodesz;
126068785679SHong Zhang     }
126168785679SHong Zhang   }
126268785679SHong Zhang   /* Use the correct node_max */
126368785679SHong Zhang   node_max = j;
126468785679SHong Zhang 
126568785679SHong Zhang   /* Now reorder the inode info based on mat re-ordering info */
126668785679SHong Zhang   /* First create a row -> inode_size_array_index map */
12679566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &nsmap));
12689566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
12694d12350bSJunchao Zhang   tmp_vec2[0] = 0;
127068785679SHong Zhang   for (i = 0, row = 0; i < node_max; i++) {
127168785679SHong Zhang     nodesz = tmp_vec1[i];
1272ad540459SPierre Jolivet     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
127368785679SHong Zhang   }
127468785679SHong Zhang   /* Using nsmap, create a reordered ns structure */
127568785679SHong Zhang   for (i = 0, j = 0; i < node_max; i++) {
127668785679SHong Zhang     nodesz          = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
12774d12350bSJunchao Zhang     tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
127868785679SHong Zhang     j += nodesz;
127968785679SHong Zhang   }
12809566063dSJacob Faibussowitsch   PetscCall(PetscFree(nsmap));
12819566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec1));
1282b89f182dSHong Zhang 
128368785679SHong Zhang   /* Now use the correct ns */
128468785679SHong Zhang   ns = tmp_vec2;
128568785679SHong Zhang 
128628f1b45aSHong Zhang   do {
128707b50cabSHong Zhang     sctx.newshift = PETSC_FALSE;
128828f1b45aSHong Zhang     /* Now loop over each block-row, and do the factorization */
128928f1b45aSHong Zhang     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
12904d12350bSJunchao Zhang       nodesz = ns[inod + 1] - ns[inod];
129128f1b45aSHong Zhang 
129228f1b45aSHong Zhang       switch (nodesz) {
129328f1b45aSHong Zhang       case 1:
1294b89f182dSHong Zhang         /* zero rtmp1 */
129528f1b45aSHong Zhang         /* L part */
129628f1b45aSHong Zhang         nz    = bi[i + 1] - bi[i];
129728f1b45aSHong Zhang         bjtmp = bj + bi[i];
1298b89f182dSHong Zhang         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
129928f1b45aSHong Zhang 
130028f1b45aSHong Zhang         /* U part */
130128f1b45aSHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
130228f1b45aSHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
1303b89f182dSHong Zhang         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
130428f1b45aSHong Zhang 
130528f1b45aSHong Zhang         /* load in initial (unfactored row) */
130628f1b45aSHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
130728f1b45aSHong Zhang         ajtmp = aj + ai[r[i]];
130828f1b45aSHong Zhang         v     = aa + ai[r[i]];
13092205254eSKarl Rupp         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
13102205254eSKarl Rupp 
131128f1b45aSHong Zhang         /* ZeropivotApply() */
1312b89f182dSHong Zhang         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
131328f1b45aSHong Zhang 
131428f1b45aSHong Zhang         /* elimination */
131528f1b45aSHong Zhang         bjtmp = bj + bi[i];
131628f1b45aSHong Zhang         row   = *bjtmp++;
131728f1b45aSHong Zhang         nzL   = bi[i + 1] - bi[i];
131828f1b45aSHong Zhang         for (k = 0; k < nzL; k++) {
1319b89f182dSHong Zhang           pc = rtmp1 + row;
132028f1b45aSHong Zhang           if (*pc != 0.0) {
132128f1b45aSHong Zhang             pv   = b->a + bdiag[row];
1322b89f182dSHong Zhang             mul1 = *pc * (*pv);
1323b89f182dSHong Zhang             *pc  = mul1;
132428f1b45aSHong Zhang             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
132528f1b45aSHong Zhang             pv   = b->a + bdiag[row + 1] + 1;
132628f1b45aSHong Zhang             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1327b89f182dSHong Zhang             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
13289566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1 + 2.0 * nz));
132928f1b45aSHong Zhang           }
133028f1b45aSHong Zhang           row = *bjtmp++;
133128f1b45aSHong Zhang         }
133228f1b45aSHong Zhang 
133328f1b45aSHong Zhang         /* finished row so stick it into b->a */
133428f1b45aSHong Zhang         rs = 0.0;
133528f1b45aSHong Zhang         /* L part */
133628f1b45aSHong Zhang         pv = b->a + bi[i];
133728f1b45aSHong Zhang         pj = b->j + bi[i];
133828f1b45aSHong Zhang         nz = bi[i + 1] - bi[i];
133928f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
13409371c9d4SSatish Balay           pv[j] = rtmp1[pj[j]];
13419371c9d4SSatish Balay           rs += PetscAbsScalar(pv[j]);
134228f1b45aSHong Zhang         }
134328f1b45aSHong Zhang 
134428f1b45aSHong Zhang         /* U part */
134528f1b45aSHong Zhang         pv = b->a + bdiag[i + 1] + 1;
134628f1b45aSHong Zhang         pj = b->j + bdiag[i + 1] + 1;
134728f1b45aSHong Zhang         nz = bdiag[i] - bdiag[i + 1] - 1;
134828f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
13499371c9d4SSatish Balay           pv[j] = rtmp1[pj[j]];
13509371c9d4SSatish Balay           rs += PetscAbsScalar(pv[j]);
135128f1b45aSHong Zhang         }
135228f1b45aSHong Zhang 
1353b89f182dSHong Zhang         /* Check zero pivot */
135428f1b45aSHong Zhang         sctx.rs = rs;
1355b89f182dSHong Zhang         sctx.pv = rtmp1[i];
13569566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
135707b50cabSHong Zhang         if (sctx.newshift) break;
135828f1b45aSHong Zhang 
1359a5b23f4aSJose E. Roman         /* Mark diagonal and invert diagonal for simpler triangular solves */
136028f1b45aSHong Zhang         pv  = b->a + bdiag[i];
1361b89f182dSHong Zhang         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
136228f1b45aSHong Zhang         break;
136328f1b45aSHong Zhang 
136428f1b45aSHong Zhang       case 2:
1365b89f182dSHong Zhang         /* zero rtmp1 and rtmp2 */
136628f1b45aSHong Zhang         /* L part */
136728f1b45aSHong Zhang         nz    = bi[i + 1] - bi[i];
136828f1b45aSHong Zhang         bjtmp = bj + bi[i];
136928f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
137068785679SHong Zhang           col        = bjtmp[j];
13719371c9d4SSatish Balay           rtmp1[col] = 0.0;
13729371c9d4SSatish Balay           rtmp2[col] = 0.0;
137328f1b45aSHong Zhang         }
137428f1b45aSHong Zhang 
137528f1b45aSHong Zhang         /* U part */
137628f1b45aSHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
137728f1b45aSHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
137828f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
137968785679SHong Zhang           col        = bjtmp[j];
13809371c9d4SSatish Balay           rtmp1[col] = 0.0;
13819371c9d4SSatish Balay           rtmp2[col] = 0.0;
138228f1b45aSHong Zhang         }
138328f1b45aSHong Zhang 
138428f1b45aSHong Zhang         /* load in initial (unfactored row) */
138528f1b45aSHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
138628f1b45aSHong Zhang         ajtmp = aj + ai[r[i]];
13879371c9d4SSatish Balay         v1    = aa + ai[r[i]];
13889371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
138928f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
139068785679SHong Zhang           col        = ics[ajtmp[j]];
13919371c9d4SSatish Balay           rtmp1[col] = v1[j];
13929371c9d4SSatish Balay           rtmp2[col] = v2[j];
139328f1b45aSHong Zhang         }
139428f1b45aSHong Zhang         /* ZeropivotApply(): shift the diagonal of the matrix  */
13959371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
13969371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
139728f1b45aSHong Zhang 
139828f1b45aSHong Zhang         /* elimination */
139928f1b45aSHong Zhang         bjtmp = bj + bi[i];
140028f1b45aSHong Zhang         row   = *bjtmp++; /* pivot row */
140128f1b45aSHong Zhang         nzL   = bi[i + 1] - bi[i];
140228f1b45aSHong Zhang         for (k = 0; k < nzL; k++) {
1403b89f182dSHong Zhang           pc1 = rtmp1 + row;
1404b89f182dSHong Zhang           pc2 = rtmp2 + row;
140528f1b45aSHong Zhang           if (*pc1 != 0.0 || *pc2 != 0.0) {
140628f1b45aSHong Zhang             pv   = b->a + bdiag[row];
14079371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
14089371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
14099371c9d4SSatish Balay             *pc1 = mul1;
14109371c9d4SSatish Balay             *pc2 = mul2;
141128f1b45aSHong Zhang 
141228f1b45aSHong Zhang             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
141328f1b45aSHong Zhang             pv = b->a + bdiag[row + 1] + 1;
141428f1b45aSHong Zhang             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
141528f1b45aSHong Zhang             for (j = 0; j < nz; j++) {
141668785679SHong Zhang               col = pj[j];
1417b89f182dSHong Zhang               rtmp1[col] -= mul1 * pv[j];
1418b89f182dSHong Zhang               rtmp2[col] -= mul2 * pv[j];
141928f1b45aSHong Zhang             }
14209566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(2 + 4.0 * nz));
142128f1b45aSHong Zhang           }
142228f1b45aSHong Zhang           row = *bjtmp++;
142328f1b45aSHong Zhang         }
142428f1b45aSHong Zhang 
1425b89f182dSHong Zhang         /* finished row i; check zero pivot, then stick row i into b->a */
142628f1b45aSHong Zhang         rs = 0.0;
142728f1b45aSHong Zhang         /* L part */
1428b89f182dSHong Zhang         pc1 = b->a + bi[i];
142928f1b45aSHong Zhang         pj  = b->j + bi[i];
143028f1b45aSHong Zhang         nz  = bi[i + 1] - bi[i];
143128f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
143268785679SHong Zhang           col    = pj[j];
14339371c9d4SSatish Balay           pc1[j] = rtmp1[col];
14349371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
143528f1b45aSHong Zhang         }
143628f1b45aSHong Zhang         /* U part */
1437b89f182dSHong Zhang         pc1 = b->a + bdiag[i + 1] + 1;
143828f1b45aSHong Zhang         pj  = b->j + bdiag[i + 1] + 1;
14390e7a5c2bSHong Zhang         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
144028f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
144168785679SHong Zhang           col    = pj[j];
14429371c9d4SSatish Balay           pc1[j] = rtmp1[col];
14439371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
144428f1b45aSHong Zhang         }
144528f1b45aSHong Zhang 
144628f1b45aSHong Zhang         sctx.rs = rs;
1447b89f182dSHong Zhang         sctx.pv = rtmp1[i];
14489566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
144907b50cabSHong Zhang         if (sctx.newshift) break;
1450b89f182dSHong Zhang         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1451b89f182dSHong Zhang         *pc1 = 1.0 / sctx.pv;
1452b89f182dSHong Zhang 
1453b89f182dSHong Zhang         /* Now take care of diagonal 2x2 block. */
1454b89f182dSHong Zhang         pc2 = rtmp2 + i;
1455b89f182dSHong Zhang         if (*pc2 != 0.0) {
1456b89f182dSHong Zhang           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1457b89f182dSHong Zhang           *pc2 = mul1;                        /* insert L entry */
1458b89f182dSHong Zhang           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1459b89f182dSHong Zhang           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1460b89f182dSHong Zhang           for (j = 0; j < nz; j++) {
14619371c9d4SSatish Balay             col = pj[j];
14629371c9d4SSatish Balay             rtmp2[col] -= mul1 * rtmp1[col];
146328f1b45aSHong Zhang           }
14649566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1465b89f182dSHong Zhang         }
1466b89f182dSHong Zhang 
1467b89f182dSHong Zhang         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1468b89f182dSHong Zhang         rs = 0.0;
1469b89f182dSHong Zhang         /* L part */
1470b89f182dSHong Zhang         pc2 = b->a + bi[i + 1];
1471b89f182dSHong Zhang         pj  = b->j + bi[i + 1];
1472b89f182dSHong Zhang         nz  = bi[i + 2] - bi[i + 1];
1473b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1474b89f182dSHong Zhang           col    = pj[j];
14759371c9d4SSatish Balay           pc2[j] = rtmp2[col];
14769371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1477b89f182dSHong Zhang         }
1478b89f182dSHong Zhang         /* U part */
1479b89f182dSHong Zhang         pc2 = b->a + bdiag[i + 2] + 1;
14800e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 2] + 1;
14810e7a5c2bSHong Zhang         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1482b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1483b89f182dSHong Zhang           col    = pj[j];
14849371c9d4SSatish Balay           pc2[j] = rtmp2[col];
14859371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1486b89f182dSHong Zhang         }
1487b89f182dSHong Zhang 
148828f1b45aSHong Zhang         sctx.rs = rs;
1489b89f182dSHong Zhang         sctx.pv = rtmp2[i + 1];
14909566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
149107b50cabSHong Zhang         if (sctx.newshift) break;
149228f1b45aSHong Zhang         pc2  = b->a + bdiag[i + 1];
1493b89f182dSHong Zhang         *pc2 = 1.0 / sctx.pv;
149428f1b45aSHong Zhang         break;
1495b89f182dSHong Zhang 
149668785679SHong Zhang       case 3:
149768785679SHong Zhang         /* zero rtmp */
149868785679SHong Zhang         /* L part */
149968785679SHong Zhang         nz    = bi[i + 1] - bi[i];
150068785679SHong Zhang         bjtmp = bj + bi[i];
150168785679SHong Zhang         for (j = 0; j < nz; j++) {
150268785679SHong Zhang           col        = bjtmp[j];
15039371c9d4SSatish Balay           rtmp1[col] = 0.0;
15049371c9d4SSatish Balay           rtmp2[col] = 0.0;
15059371c9d4SSatish Balay           rtmp3[col] = 0.0;
150668785679SHong Zhang         }
150768785679SHong Zhang 
150868785679SHong Zhang         /* U part */
150968785679SHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
151068785679SHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
151168785679SHong Zhang         for (j = 0; j < nz; j++) {
151268785679SHong Zhang           col        = bjtmp[j];
15139371c9d4SSatish Balay           rtmp1[col] = 0.0;
15149371c9d4SSatish Balay           rtmp2[col] = 0.0;
15159371c9d4SSatish Balay           rtmp3[col] = 0.0;
151668785679SHong Zhang         }
151768785679SHong Zhang 
151868785679SHong Zhang         /* load in initial (unfactored row) */
151968785679SHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
152068785679SHong Zhang         ajtmp = aj + ai[r[i]];
15219371c9d4SSatish Balay         v1    = aa + ai[r[i]];
15229371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
15239371c9d4SSatish Balay         v3    = aa + ai[r[i] + 2];
152468785679SHong Zhang         for (j = 0; j < nz; j++) {
152568785679SHong Zhang           col        = ics[ajtmp[j]];
15269371c9d4SSatish Balay           rtmp1[col] = v1[j];
15279371c9d4SSatish Balay           rtmp2[col] = v2[j];
15289371c9d4SSatish Balay           rtmp3[col] = v3[j];
152968785679SHong Zhang         }
153068785679SHong Zhang         /* ZeropivotApply(): shift the diagonal of the matrix  */
15319371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
15329371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
15339371c9d4SSatish Balay         rtmp3[i + 2] += sctx.shift_amount;
153468785679SHong Zhang 
153568785679SHong Zhang         /* elimination */
153668785679SHong Zhang         bjtmp = bj + bi[i];
153768785679SHong Zhang         row   = *bjtmp++; /* pivot row */
153868785679SHong Zhang         nzL   = bi[i + 1] - bi[i];
153968785679SHong Zhang         for (k = 0; k < nzL; k++) {
1540b89f182dSHong Zhang           pc1 = rtmp1 + row;
1541b89f182dSHong Zhang           pc2 = rtmp2 + row;
1542b89f182dSHong Zhang           pc3 = rtmp3 + row;
154368785679SHong Zhang           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
154468785679SHong Zhang             pv   = b->a + bdiag[row];
15459371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
15469371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
15479371c9d4SSatish Balay             mul3 = *pc3 * (*pv);
15489371c9d4SSatish Balay             *pc1 = mul1;
15499371c9d4SSatish Balay             *pc2 = mul2;
15509371c9d4SSatish Balay             *pc3 = mul3;
155168785679SHong Zhang 
155268785679SHong Zhang             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
155368785679SHong Zhang             pv = b->a + bdiag[row + 1] + 1;
155468785679SHong Zhang             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
155568785679SHong Zhang             for (j = 0; j < nz; j++) {
155668785679SHong Zhang               col = pj[j];
1557b89f182dSHong Zhang               rtmp1[col] -= mul1 * pv[j];
1558b89f182dSHong Zhang               rtmp2[col] -= mul2 * pv[j];
1559b89f182dSHong Zhang               rtmp3[col] -= mul3 * pv[j];
156068785679SHong Zhang             }
15619566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(3 + 6.0 * nz));
156268785679SHong Zhang           }
156368785679SHong Zhang           row = *bjtmp++;
156468785679SHong Zhang         }
156568785679SHong Zhang 
1566b89f182dSHong Zhang         /* finished row i; check zero pivot, then stick row i into b->a */
1567b89f182dSHong Zhang         rs = 0.0;
1568b89f182dSHong Zhang         /* L part */
1569b89f182dSHong Zhang         pc1 = b->a + bi[i];
1570b89f182dSHong Zhang         pj  = b->j + bi[i];
1571b89f182dSHong Zhang         nz  = bi[i + 1] - bi[i];
1572b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1573b89f182dSHong Zhang           col    = pj[j];
15749371c9d4SSatish Balay           pc1[j] = rtmp1[col];
15759371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
1576b89f182dSHong Zhang         }
1577b89f182dSHong Zhang         /* U part */
1578b89f182dSHong Zhang         pc1 = b->a + bdiag[i + 1] + 1;
1579b89f182dSHong Zhang         pj  = b->j + bdiag[i + 1] + 1;
15800e7a5c2bSHong Zhang         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1581b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1582b89f182dSHong Zhang           col    = pj[j];
15839371c9d4SSatish Balay           pc1[j] = rtmp1[col];
15849371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
1585b89f182dSHong Zhang         }
158668785679SHong Zhang 
1587b89f182dSHong Zhang         sctx.rs = rs;
1588b89f182dSHong Zhang         sctx.pv = rtmp1[i];
15899566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
159007b50cabSHong Zhang         if (sctx.newshift) break;
1591b89f182dSHong Zhang         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1592b89f182dSHong Zhang         *pc1 = 1.0 / sctx.pv;
1593b89f182dSHong Zhang 
1594b89f182dSHong Zhang         /* Now take care of 1st column of diagonal 3x3 block. */
1595b89f182dSHong Zhang         pc2 = rtmp2 + i;
1596b89f182dSHong Zhang         pc3 = rtmp3 + i;
1597b89f182dSHong Zhang         if (*pc2 != 0.0 || *pc3 != 0.0) {
15989371c9d4SSatish Balay           mul2 = (*pc2) * (*pc1);
15999371c9d4SSatish Balay           *pc2 = mul2;
16009371c9d4SSatish Balay           mul3 = (*pc3) * (*pc1);
16019371c9d4SSatish Balay           *pc3 = mul3;
160268785679SHong Zhang           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
160368785679SHong Zhang           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
160468785679SHong Zhang           for (j = 0; j < nz; j++) {
160568785679SHong Zhang             col = pj[j];
1606b89f182dSHong Zhang             rtmp2[col] -= mul2 * rtmp1[col];
1607b89f182dSHong Zhang             rtmp3[col] -= mul3 * rtmp1[col];
160868785679SHong Zhang           }
16099566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(2 + 4.0 * nz));
161068785679SHong Zhang         }
161168785679SHong Zhang 
1612b89f182dSHong Zhang         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1613b89f182dSHong Zhang         rs = 0.0;
1614b89f182dSHong Zhang         /* L part */
1615b89f182dSHong Zhang         pc2 = b->a + bi[i + 1];
1616b89f182dSHong Zhang         pj  = b->j + bi[i + 1];
1617b89f182dSHong Zhang         nz  = bi[i + 2] - bi[i + 1];
1618b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1619b89f182dSHong Zhang           col    = pj[j];
16209371c9d4SSatish Balay           pc2[j] = rtmp2[col];
16219371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1622b89f182dSHong Zhang         }
1623b89f182dSHong Zhang         /* U part */
1624b89f182dSHong Zhang         pc2 = b->a + bdiag[i + 2] + 1;
16250e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 2] + 1;
16260e7a5c2bSHong Zhang         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1627b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1628b89f182dSHong Zhang           col    = pj[j];
16299371c9d4SSatish Balay           pc2[j] = rtmp2[col];
16309371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1631b89f182dSHong Zhang         }
1632b89f182dSHong Zhang 
1633b89f182dSHong Zhang         sctx.rs = rs;
1634b89f182dSHong Zhang         sctx.pv = rtmp2[i + 1];
16359566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
163607b50cabSHong Zhang         if (sctx.newshift) break;
1637b89f182dSHong Zhang         pc2  = b->a + bdiag[i + 1];
1638b89f182dSHong Zhang         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1639b89f182dSHong Zhang 
1640b89f182dSHong Zhang         /* Now take care of 2nd column of diagonal 3x3 block. */
1641b89f182dSHong Zhang         pc3 = rtmp3 + i + 1;
164268785679SHong Zhang         if (*pc3 != 0.0) {
16439371c9d4SSatish Balay           mul3 = (*pc3) * (*pc2);
16449371c9d4SSatish Balay           *pc3 = mul3;
164568785679SHong Zhang           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
164668785679SHong Zhang           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
164768785679SHong Zhang           for (j = 0; j < nz; j++) {
164868785679SHong Zhang             col = pj[j];
1649b89f182dSHong Zhang             rtmp3[col] -= mul3 * rtmp2[col];
165068785679SHong Zhang           }
16519566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
165268785679SHong Zhang         }
165368785679SHong Zhang 
1654b89f182dSHong Zhang         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
165568785679SHong Zhang         rs = 0.0;
165668785679SHong Zhang         /* L part */
1657b89f182dSHong Zhang         pc3 = b->a + bi[i + 2];
1658b89f182dSHong Zhang         pj  = b->j + bi[i + 2];
1659b89f182dSHong Zhang         nz  = bi[i + 3] - bi[i + 2];
166068785679SHong Zhang         for (j = 0; j < nz; j++) {
166168785679SHong Zhang           col    = pj[j];
16629371c9d4SSatish Balay           pc3[j] = rtmp3[col];
16639371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
166468785679SHong Zhang         }
166568785679SHong Zhang         /* U part */
1666b89f182dSHong Zhang         pc3 = b->a + bdiag[i + 3] + 1;
16670e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 3] + 1;
16680e7a5c2bSHong Zhang         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
166968785679SHong Zhang         for (j = 0; j < nz; j++) {
167068785679SHong Zhang           col    = pj[j];
16719371c9d4SSatish Balay           pc3[j] = rtmp3[col];
16729371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
167368785679SHong Zhang         }
167468785679SHong Zhang 
167568785679SHong Zhang         sctx.rs = rs;
1676b89f182dSHong Zhang         sctx.pv = rtmp3[i + 2];
16779566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
167807b50cabSHong Zhang         if (sctx.newshift) break;
167968785679SHong Zhang         pc3  = b->a + bdiag[i + 2];
1680b89f182dSHong Zhang         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
168168785679SHong Zhang         break;
16829877982aSShri Abhyankar       case 4:
16839877982aSShri Abhyankar         /* zero rtmp */
16849877982aSShri Abhyankar         /* L part */
16859877982aSShri Abhyankar         nz    = bi[i + 1] - bi[i];
16869877982aSShri Abhyankar         bjtmp = bj + bi[i];
16879877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
16889877982aSShri Abhyankar           col        = bjtmp[j];
16899371c9d4SSatish Balay           rtmp1[col] = 0.0;
16909371c9d4SSatish Balay           rtmp2[col] = 0.0;
16919371c9d4SSatish Balay           rtmp3[col] = 0.0;
16929371c9d4SSatish Balay           rtmp4[col] = 0.0;
16939877982aSShri Abhyankar         }
16949877982aSShri Abhyankar 
16959877982aSShri Abhyankar         /* U part */
16969877982aSShri Abhyankar         nz    = bdiag[i] - bdiag[i + 1];
16979877982aSShri Abhyankar         bjtmp = bj + bdiag[i + 1] + 1;
16989877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
16999877982aSShri Abhyankar           col        = bjtmp[j];
17009371c9d4SSatish Balay           rtmp1[col] = 0.0;
17019371c9d4SSatish Balay           rtmp2[col] = 0.0;
17029371c9d4SSatish Balay           rtmp3[col] = 0.0;
17039371c9d4SSatish Balay           rtmp4[col] = 0.0;
17049877982aSShri Abhyankar         }
17059877982aSShri Abhyankar 
17069877982aSShri Abhyankar         /* load in initial (unfactored row) */
17079877982aSShri Abhyankar         nz    = ai[r[i] + 1] - ai[r[i]];
17089877982aSShri Abhyankar         ajtmp = aj + ai[r[i]];
17099371c9d4SSatish Balay         v1    = aa + ai[r[i]];
17109371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
17119371c9d4SSatish Balay         v3    = aa + ai[r[i] + 2];
17129371c9d4SSatish Balay         v4    = aa + ai[r[i] + 3];
17139877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17149877982aSShri Abhyankar           col        = ics[ajtmp[j]];
17159371c9d4SSatish Balay           rtmp1[col] = v1[j];
17169371c9d4SSatish Balay           rtmp2[col] = v2[j];
17179371c9d4SSatish Balay           rtmp3[col] = v3[j];
17189371c9d4SSatish Balay           rtmp4[col] = v4[j];
17199877982aSShri Abhyankar         }
17209877982aSShri Abhyankar         /* ZeropivotApply(): shift the diagonal of the matrix  */
17219371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
17229371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
17239371c9d4SSatish Balay         rtmp3[i + 2] += sctx.shift_amount;
17249371c9d4SSatish Balay         rtmp4[i + 3] += sctx.shift_amount;
17259877982aSShri Abhyankar 
17269877982aSShri Abhyankar         /* elimination */
17279877982aSShri Abhyankar         bjtmp = bj + bi[i];
17289877982aSShri Abhyankar         row   = *bjtmp++; /* pivot row */
17299877982aSShri Abhyankar         nzL   = bi[i + 1] - bi[i];
17309877982aSShri Abhyankar         for (k = 0; k < nzL; k++) {
17319877982aSShri Abhyankar           pc1 = rtmp1 + row;
17329877982aSShri Abhyankar           pc2 = rtmp2 + row;
17339877982aSShri Abhyankar           pc3 = rtmp3 + row;
17349877982aSShri Abhyankar           pc4 = rtmp4 + row;
17359877982aSShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
17369877982aSShri Abhyankar             pv   = b->a + bdiag[row];
17379371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
17389371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
17399371c9d4SSatish Balay             mul3 = *pc3 * (*pv);
17409371c9d4SSatish Balay             mul4 = *pc4 * (*pv);
17419371c9d4SSatish Balay             *pc1 = mul1;
17429371c9d4SSatish Balay             *pc2 = mul2;
17439371c9d4SSatish Balay             *pc3 = mul3;
17449371c9d4SSatish Balay             *pc4 = mul4;
17459877982aSShri Abhyankar 
17469877982aSShri Abhyankar             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
17479877982aSShri Abhyankar             pv = b->a + bdiag[row + 1] + 1;
17489877982aSShri Abhyankar             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
17499877982aSShri Abhyankar             for (j = 0; j < nz; j++) {
17509877982aSShri Abhyankar               col = pj[j];
17519877982aSShri Abhyankar               rtmp1[col] -= mul1 * pv[j];
17529877982aSShri Abhyankar               rtmp2[col] -= mul2 * pv[j];
17539877982aSShri Abhyankar               rtmp3[col] -= mul3 * pv[j];
17549877982aSShri Abhyankar               rtmp4[col] -= mul4 * pv[j];
17559877982aSShri Abhyankar             }
17569566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(4 + 8.0 * nz));
17579877982aSShri Abhyankar           }
17589877982aSShri Abhyankar           row = *bjtmp++;
17599877982aSShri Abhyankar         }
17609877982aSShri Abhyankar 
17619877982aSShri Abhyankar         /* finished row i; check zero pivot, then stick row i into b->a */
17629877982aSShri Abhyankar         rs = 0.0;
17639877982aSShri Abhyankar         /* L part */
17649877982aSShri Abhyankar         pc1 = b->a + bi[i];
17659877982aSShri Abhyankar         pj  = b->j + bi[i];
17669877982aSShri Abhyankar         nz  = bi[i + 1] - bi[i];
17679877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17689877982aSShri Abhyankar           col    = pj[j];
17699371c9d4SSatish Balay           pc1[j] = rtmp1[col];
17709371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
17719877982aSShri Abhyankar         }
17729877982aSShri Abhyankar         /* U part */
17739877982aSShri Abhyankar         pc1 = b->a + bdiag[i + 1] + 1;
17749877982aSShri Abhyankar         pj  = b->j + bdiag[i + 1] + 1;
17759877982aSShri Abhyankar         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
17769877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17779877982aSShri Abhyankar           col    = pj[j];
17789371c9d4SSatish Balay           pc1[j] = rtmp1[col];
17799371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
17809877982aSShri Abhyankar         }
17819877982aSShri Abhyankar 
17829877982aSShri Abhyankar         sctx.rs = rs;
17839877982aSShri Abhyankar         sctx.pv = rtmp1[i];
17849566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
178507b50cabSHong Zhang         if (sctx.newshift) break;
17869877982aSShri Abhyankar         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
17879877982aSShri Abhyankar         *pc1 = 1.0 / sctx.pv;
17889877982aSShri Abhyankar 
17899877982aSShri Abhyankar         /* Now take care of 1st column of diagonal 4x4 block. */
17909877982aSShri Abhyankar         pc2 = rtmp2 + i;
17919877982aSShri Abhyankar         pc3 = rtmp3 + i;
17929877982aSShri Abhyankar         pc4 = rtmp4 + i;
17939877982aSShri Abhyankar         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
17949371c9d4SSatish Balay           mul2 = (*pc2) * (*pc1);
17959371c9d4SSatish Balay           *pc2 = mul2;
17969371c9d4SSatish Balay           mul3 = (*pc3) * (*pc1);
17979371c9d4SSatish Balay           *pc3 = mul3;
17989371c9d4SSatish Balay           mul4 = (*pc4) * (*pc1);
17999371c9d4SSatish Balay           *pc4 = mul4;
18009877982aSShri Abhyankar           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
18019877982aSShri Abhyankar           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
18029877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
18039877982aSShri Abhyankar             col = pj[j];
18049877982aSShri Abhyankar             rtmp2[col] -= mul2 * rtmp1[col];
18059877982aSShri Abhyankar             rtmp3[col] -= mul3 * rtmp1[col];
18069877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp1[col];
18079877982aSShri Abhyankar           }
18089566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(3 + 6.0 * nz));
18099877982aSShri Abhyankar         }
18109877982aSShri Abhyankar 
18119877982aSShri Abhyankar         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
18129877982aSShri Abhyankar         rs = 0.0;
18139877982aSShri Abhyankar         /* L part */
18149877982aSShri Abhyankar         pc2 = b->a + bi[i + 1];
18159877982aSShri Abhyankar         pj  = b->j + bi[i + 1];
18169877982aSShri Abhyankar         nz  = bi[i + 2] - bi[i + 1];
18179877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18189877982aSShri Abhyankar           col    = pj[j];
18199371c9d4SSatish Balay           pc2[j] = rtmp2[col];
18209371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
18219877982aSShri Abhyankar         }
18229877982aSShri Abhyankar         /* U part */
18239877982aSShri Abhyankar         pc2 = b->a + bdiag[i + 2] + 1;
18249877982aSShri Abhyankar         pj  = b->j + bdiag[i + 2] + 1;
18259877982aSShri Abhyankar         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
18269877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18279877982aSShri Abhyankar           col    = pj[j];
18289371c9d4SSatish Balay           pc2[j] = rtmp2[col];
18299371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
18309877982aSShri Abhyankar         }
18319877982aSShri Abhyankar 
18329877982aSShri Abhyankar         sctx.rs = rs;
18339877982aSShri Abhyankar         sctx.pv = rtmp2[i + 1];
18349566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
183507b50cabSHong Zhang         if (sctx.newshift) break;
18369877982aSShri Abhyankar         pc2  = b->a + bdiag[i + 1];
18379877982aSShri Abhyankar         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
18389877982aSShri Abhyankar 
18399877982aSShri Abhyankar         /* Now take care of 2nd column of diagonal 4x4 block. */
18409877982aSShri Abhyankar         pc3 = rtmp3 + i + 1;
18419877982aSShri Abhyankar         pc4 = rtmp4 + i + 1;
18429877982aSShri Abhyankar         if (*pc3 != 0.0 || *pc4 != 0.0) {
18439371c9d4SSatish Balay           mul3 = (*pc3) * (*pc2);
18449371c9d4SSatish Balay           *pc3 = mul3;
18459371c9d4SSatish Balay           mul4 = (*pc4) * (*pc2);
18469371c9d4SSatish Balay           *pc4 = mul4;
18479877982aSShri Abhyankar           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
18489877982aSShri Abhyankar           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
18499877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
18509877982aSShri Abhyankar             col = pj[j];
18519877982aSShri Abhyankar             rtmp3[col] -= mul3 * rtmp2[col];
18529877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp2[col];
18539877982aSShri Abhyankar           }
18549566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(4.0 * nz));
18559877982aSShri Abhyankar         }
18569877982aSShri Abhyankar 
18579877982aSShri Abhyankar         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
18589877982aSShri Abhyankar         rs = 0.0;
18599877982aSShri Abhyankar         /* L part */
18609877982aSShri Abhyankar         pc3 = b->a + bi[i + 2];
18619877982aSShri Abhyankar         pj  = b->j + bi[i + 2];
18629877982aSShri Abhyankar         nz  = bi[i + 3] - bi[i + 2];
18639877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18649877982aSShri Abhyankar           col    = pj[j];
18659371c9d4SSatish Balay           pc3[j] = rtmp3[col];
18669371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
18679877982aSShri Abhyankar         }
18689877982aSShri Abhyankar         /* U part */
18699877982aSShri Abhyankar         pc3 = b->a + bdiag[i + 3] + 1;
18709877982aSShri Abhyankar         pj  = b->j + bdiag[i + 3] + 1;
18719877982aSShri Abhyankar         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
18729877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18739877982aSShri Abhyankar           col    = pj[j];
18749371c9d4SSatish Balay           pc3[j] = rtmp3[col];
18759371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
18769877982aSShri Abhyankar         }
18779877982aSShri Abhyankar 
18789877982aSShri Abhyankar         sctx.rs = rs;
18799877982aSShri Abhyankar         sctx.pv = rtmp3[i + 2];
18809566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
188107b50cabSHong Zhang         if (sctx.newshift) break;
18829877982aSShri Abhyankar         pc3  = b->a + bdiag[i + 2];
18839877982aSShri Abhyankar         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
18849877982aSShri Abhyankar 
18859877982aSShri Abhyankar         /* Now take care of 3rd column of diagonal 4x4 block. */
18869877982aSShri Abhyankar         pc4 = rtmp4 + i + 2;
18879877982aSShri Abhyankar         if (*pc4 != 0.0) {
18889371c9d4SSatish Balay           mul4 = (*pc4) * (*pc3);
18899371c9d4SSatish Balay           *pc4 = mul4;
18909877982aSShri Abhyankar           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
18919877982aSShri Abhyankar           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
18929877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
18939877982aSShri Abhyankar             col = pj[j];
18949877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp3[col];
18959877982aSShri Abhyankar           }
18969566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
18979877982aSShri Abhyankar         }
18989877982aSShri Abhyankar 
18999877982aSShri Abhyankar         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
19009877982aSShri Abhyankar         rs = 0.0;
19019877982aSShri Abhyankar         /* L part */
19029877982aSShri Abhyankar         pc4 = b->a + bi[i + 3];
19039877982aSShri Abhyankar         pj  = b->j + bi[i + 3];
19049877982aSShri Abhyankar         nz  = bi[i + 4] - bi[i + 3];
19059877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
19069877982aSShri Abhyankar           col    = pj[j];
19079371c9d4SSatish Balay           pc4[j] = rtmp4[col];
19089371c9d4SSatish Balay           rs += PetscAbsScalar(pc4[j]);
19099877982aSShri Abhyankar         }
19109877982aSShri Abhyankar         /* U part */
19119877982aSShri Abhyankar         pc4 = b->a + bdiag[i + 4] + 1;
19129877982aSShri Abhyankar         pj  = b->j + bdiag[i + 4] + 1;
19139877982aSShri Abhyankar         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
19149877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
19159877982aSShri Abhyankar           col    = pj[j];
19169371c9d4SSatish Balay           pc4[j] = rtmp4[col];
19179371c9d4SSatish Balay           rs += PetscAbsScalar(pc4[j]);
19189877982aSShri Abhyankar         }
19199877982aSShri Abhyankar 
19209877982aSShri Abhyankar         sctx.rs = rs;
19219877982aSShri Abhyankar         sctx.pv = rtmp4[i + 3];
19229566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
192307b50cabSHong Zhang         if (sctx.newshift) break;
19249877982aSShri Abhyankar         pc4  = b->a + bdiag[i + 3];
19259877982aSShri Abhyankar         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
19269877982aSShri Abhyankar         break;
192768785679SHong Zhang 
1928d71ae5a4SJacob Faibussowitsch       default:
1929d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
193028f1b45aSHong Zhang       }
1931c2b86aeeSHong Zhang       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
193228f1b45aSHong Zhang       i += nodesz;              /* Update the row */
193368785679SHong Zhang     }
193428f1b45aSHong Zhang 
193528f1b45aSHong Zhang     /* MatPivotRefine() */
193607b50cabSHong Zhang     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
193728f1b45aSHong Zhang       /*
193828f1b45aSHong Zhang        * This pass needed no new shift, a shift is already in use (shift_fraction > 0),
193928f1b45aSHong Zhang        * and tries remain (nshift < nshift_max): bisect between shift_lo and it to try a lower shift
194028f1b45aSHong Zhang        */
194128f1b45aSHong Zhang       sctx.shift_hi       = sctx.shift_fraction;
194228f1b45aSHong Zhang       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
194328f1b45aSHong Zhang       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
194407b50cabSHong Zhang       sctx.newshift       = PETSC_TRUE;
194528f1b45aSHong Zhang       sctx.nshift++;
194628f1b45aSHong Zhang     }
194707b50cabSHong Zhang   } while (sctx.newshift);
194828f1b45aSHong Zhang 
19499566063dSJacob Faibussowitsch   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
19509566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec2));
19519566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
19529566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
195328f1b45aSHong Zhang 
19544d12350bSJunchao Zhang   if (b->inode.size_csr) {
1955abb87a52SBarry Smith     C->ops->solve = MatSolve_SeqAIJ_Inode;
1956abb87a52SBarry Smith   } else {
1957d3ac4fa3SBarry Smith     C->ops->solve = MatSolve_SeqAIJ;
1958abb87a52SBarry Smith   }
195928f1b45aSHong Zhang   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
196028f1b45aSHong Zhang   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
196128f1b45aSHong Zhang   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
196228f1b45aSHong Zhang   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1963a3d9026eSPierre Jolivet   C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
196428f1b45aSHong Zhang   C->assembled              = PETSC_TRUE;
196528f1b45aSHong Zhang   C->preallocated           = PETSC_TRUE;
19662205254eSKarl Rupp 
19679566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(C->cmap->n));
196828f1b45aSHong Zhang 
196928f1b45aSHong Zhang   /* MatShiftView(A,info,&sctx) */
197028f1b45aSHong Zhang   if (sctx.nshift) {
1971f4db908eSBarry Smith     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
19729566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1973f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
19749566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1975f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
19769566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
197728f1b45aSHong Zhang     }
197828f1b45aSHong Zhang   }
19793ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
198028f1b45aSHong Zhang }
1981628f99d7SShri Abhyankar 
1982ff6a9541SJacob Faibussowitsch #if 0
1983ff6a9541SJacob Faibussowitsch // unused
1984ff6a9541SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1985d71ae5a4SJacob Faibussowitsch {
1986628f99d7SShri Abhyankar   Mat              C = B;
1987628f99d7SShri Abhyankar   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1988628f99d7SShri Abhyankar   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1989628f99d7SShri Abhyankar   const PetscInt  *r, *ic, *c, *ics;
1990628f99d7SShri Abhyankar   PetscInt         n = A->rmap->n, *bi = b->i;
1991628f99d7SShri Abhyankar   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
19928758e1faSBarry Smith   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
19938758e1faSBarry Smith   PetscInt        *ai = a->i, *aj = a->j;
1994628f99d7SShri Abhyankar   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1995628f99d7SShri Abhyankar   PetscScalar      mul1, mul2, mul3, tmp;
1996628f99d7SShri Abhyankar   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1997628f99d7SShri Abhyankar   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1998628f99d7SShri Abhyankar   PetscReal        rs = 0.0;
1999628f99d7SShri Abhyankar   FactorShiftCtx   sctx;
2000628f99d7SShri Abhyankar 
2001628f99d7SShri Abhyankar   PetscFunctionBegin;
2002628f99d7SShri Abhyankar   sctx.shift_top      = 0;
2003628f99d7SShri Abhyankar   sctx.nshift_max     = 0;
2004628f99d7SShri Abhyankar   sctx.shift_lo       = 0;
2005628f99d7SShri Abhyankar   sctx.shift_hi       = 0;
2006628f99d7SShri Abhyankar   sctx.shift_fraction = 0;
2007628f99d7SShri Abhyankar 
2008628f99d7SShri Abhyankar   /* if both shift schemes are chosen by user, only use info->shiftpd */
2009f4db908eSBarry Smith   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
2010628f99d7SShri Abhyankar     sctx.shift_top = 0;
2011628f99d7SShri Abhyankar     for (i = 0; i < n; i++) {
2012628f99d7SShri Abhyankar       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
2013628f99d7SShri Abhyankar       rs    = 0.0;
2014628f99d7SShri Abhyankar       ajtmp = aj + ai[i];
2015628f99d7SShri Abhyankar       rtmp1 = aa + ai[i];
2016628f99d7SShri Abhyankar       nz    = ai[i + 1] - ai[i];
2017628f99d7SShri Abhyankar       for (j = 0; j < nz; j++) {
2018628f99d7SShri Abhyankar         if (*ajtmp != i) {
2019628f99d7SShri Abhyankar           rs += PetscAbsScalar(*rtmp1++);
2020628f99d7SShri Abhyankar         } else {
2021628f99d7SShri Abhyankar           rs -= PetscRealPart(*rtmp1++);
2022628f99d7SShri Abhyankar         }
2023628f99d7SShri Abhyankar         ajtmp++;
2024628f99d7SShri Abhyankar       }
2025628f99d7SShri Abhyankar       if (rs > sctx.shift_top) sctx.shift_top = rs;
2026628f99d7SShri Abhyankar     }
2027628f99d7SShri Abhyankar     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2028628f99d7SShri Abhyankar     sctx.shift_top *= 1.1;
2029628f99d7SShri Abhyankar     sctx.nshift_max = 5;
2030628f99d7SShri Abhyankar     sctx.shift_lo   = 0.;
2031628f99d7SShri Abhyankar     sctx.shift_hi   = 1.;
2032628f99d7SShri Abhyankar   }
2033628f99d7SShri Abhyankar   sctx.shift_amount = 0;
2034628f99d7SShri Abhyankar   sctx.nshift       = 0;
2035628f99d7SShri Abhyankar 
20369566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
20379566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &c));
20389566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
20399566063dSJacob Faibussowitsch   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2040628f99d7SShri Abhyankar   ics = ic;
2041628f99d7SShri Abhyankar 
2042628f99d7SShri Abhyankar   node_max = a->inode.node_count;
2043628f99d7SShri Abhyankar   ns       = a->inode.size;
204428b400f6SJacob Faibussowitsch   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2045628f99d7SShri Abhyankar 
2046628f99d7SShri Abhyankar   /* Split every inode with more than 3 rows into two inodes. */
2047628f99d7SShri Abhyankar   /* also map the inode sizes according to the ordering */
20489566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2049628f99d7SShri Abhyankar   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2050628f99d7SShri Abhyankar     if (ns[i] > 3) {
2051628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2052628f99d7SShri Abhyankar       ++j;
2053628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2054628f99d7SShri Abhyankar     } else {
2055628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i];
2056628f99d7SShri Abhyankar     }
2057628f99d7SShri Abhyankar   }
2058628f99d7SShri Abhyankar   /* Use the correct node_max */
2059628f99d7SShri Abhyankar   node_max = j;
2060628f99d7SShri Abhyankar 
2061628f99d7SShri Abhyankar   /* Now reorder the inode info based on mat re-ordering info */
2062628f99d7SShri Abhyankar   /* First create a row -> inode_size_array_index map */
20639566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2064628f99d7SShri Abhyankar   for (i = 0, row = 0; i < node_max; i++) {
2065628f99d7SShri Abhyankar     nodesz = tmp_vec1[i];
2066ad540459SPierre Jolivet     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2067628f99d7SShri Abhyankar   }
2068628f99d7SShri Abhyankar   /* Using nsmap, create a reordered ns structure */
2069628f99d7SShri Abhyankar   for (i = 0, j = 0; i < node_max; i++) {
2070628f99d7SShri Abhyankar     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2071628f99d7SShri Abhyankar     tmp_vec2[i] = nodesz;
2072628f99d7SShri Abhyankar     j += nodesz;
2073628f99d7SShri Abhyankar   }
20749566063dSJacob Faibussowitsch   PetscCall(PetscFree2(nsmap, tmp_vec1));
2075628f99d7SShri Abhyankar   /* Now use the correct ns */
2076628f99d7SShri Abhyankar   ns = tmp_vec2;
2077628f99d7SShri Abhyankar 
2078628f99d7SShri Abhyankar   do {
207907b50cabSHong Zhang     sctx.newshift = PETSC_FALSE;
2080628f99d7SShri Abhyankar     /* Now loop over each block-row, and do the factorization */
2081628f99d7SShri Abhyankar     for (i = 0, row = 0; i < node_max; i++) {
2082628f99d7SShri Abhyankar       nodesz = ns[i];
2083628f99d7SShri Abhyankar       nz     = bi[row + 1] - bi[row];
2084628f99d7SShri Abhyankar       bjtmp  = bj + bi[row];
2085628f99d7SShri Abhyankar 
2086628f99d7SShri Abhyankar       switch (nodesz) {
2087628f99d7SShri Abhyankar       case 1:
2088628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2089628f99d7SShri Abhyankar           idx         = bjtmp[j];
2090628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2091628f99d7SShri Abhyankar         }
2092628f99d7SShri Abhyankar 
2093628f99d7SShri Abhyankar         /* load in initial (unfactored row) */
2094628f99d7SShri Abhyankar         idx    = r[row];
2095628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2096628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2097628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2098628f99d7SShri Abhyankar 
2099628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2100628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2101628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2102628f99d7SShri Abhyankar         }
2103628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2104628f99d7SShri Abhyankar 
2105628f99d7SShri Abhyankar         prow = *bjtmp++;
2106628f99d7SShri Abhyankar         while (prow < row) {
2107628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2108628f99d7SShri Abhyankar           if (*pc1 != 0.0) {
2109628f99d7SShri Abhyankar             pv     = ba + bd[prow];
2110628f99d7SShri Abhyankar             pj     = nbj + bd[prow];
2111628f99d7SShri Abhyankar             mul1   = *pc1 * *pv++;
2112628f99d7SShri Abhyankar             *pc1   = mul1;
2113628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
21149566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2115628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2116628f99d7SShri Abhyankar               tmp = pv[j];
2117628f99d7SShri Abhyankar               idx = pj[j];
2118628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2119628f99d7SShri Abhyankar             }
2120628f99d7SShri Abhyankar           }
2121628f99d7SShri Abhyankar           prow = *bjtmp++;
2122628f99d7SShri Abhyankar         }
2123628f99d7SShri Abhyankar         pj  = bj + bi[row];
2124628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2125628f99d7SShri Abhyankar 
2126628f99d7SShri Abhyankar         sctx.pv     = rtmp11[row];
2127628f99d7SShri Abhyankar         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2128628f99d7SShri Abhyankar         rs          = 0.0;
2129628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2130628f99d7SShri Abhyankar           idx    = pj[j];
2131628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2132628f99d7SShri Abhyankar           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2133628f99d7SShri Abhyankar         }
2134628f99d7SShri Abhyankar         sctx.rs = rs;
21359566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
213607b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2137628f99d7SShri Abhyankar         break;
2138628f99d7SShri Abhyankar 
2139628f99d7SShri Abhyankar       case 2:
2140628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2141628f99d7SShri Abhyankar           idx         = bjtmp[j];
2142628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2143628f99d7SShri Abhyankar           rtmp22[idx] = 0.0;
2144628f99d7SShri Abhyankar         }
2145628f99d7SShri Abhyankar 
2146628f99d7SShri Abhyankar         /* load in initial (unfactored row) */
2147628f99d7SShri Abhyankar         idx    = r[row];
2148628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2149628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2150628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2151628f99d7SShri Abhyankar         v2     = aa + ai[idx + 1];
2152628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2153628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2154628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2155628f99d7SShri Abhyankar           rtmp22[idx] = v2[j];
2156628f99d7SShri Abhyankar         }
2157628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2158628f99d7SShri Abhyankar         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2159628f99d7SShri Abhyankar 
2160628f99d7SShri Abhyankar         prow = *bjtmp++;
2161628f99d7SShri Abhyankar         while (prow < row) {
2162628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2163628f99d7SShri Abhyankar           pc2 = rtmp22 + prow;
2164628f99d7SShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0) {
2165628f99d7SShri Abhyankar             pv   = ba + bd[prow];
2166628f99d7SShri Abhyankar             pj   = nbj + bd[prow];
2167628f99d7SShri Abhyankar             mul1 = *pc1 * *pv;
2168628f99d7SShri Abhyankar             mul2 = *pc2 * *pv;
2169628f99d7SShri Abhyankar             ++pv;
2170628f99d7SShri Abhyankar             *pc1 = mul1;
2171628f99d7SShri Abhyankar             *pc2 = mul2;
2172628f99d7SShri Abhyankar 
2173628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2174628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2175628f99d7SShri Abhyankar               tmp = pv[j];
2176628f99d7SShri Abhyankar               idx = pj[j];
2177628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2178628f99d7SShri Abhyankar               rtmp22[idx] -= mul2 * tmp;
2179628f99d7SShri Abhyankar             }
21809566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2181628f99d7SShri Abhyankar           }
2182628f99d7SShri Abhyankar           prow = *bjtmp++;
2183628f99d7SShri Abhyankar         }
2184628f99d7SShri Abhyankar 
2185628f99d7SShri Abhyankar         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2186628f99d7SShri Abhyankar         pc1 = rtmp11 + prow;
2187628f99d7SShri Abhyankar         pc2 = rtmp22 + prow;
2188628f99d7SShri Abhyankar 
2189628f99d7SShri Abhyankar         sctx.pv = *pc1;
2190628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2191628f99d7SShri Abhyankar         rs      = 0.0;
2192628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2193628f99d7SShri Abhyankar           idx = pj[j];
2194628f99d7SShri Abhyankar           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2195628f99d7SShri Abhyankar         }
2196628f99d7SShri Abhyankar         sctx.rs = rs;
21979566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
219807b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2199628f99d7SShri Abhyankar 
2200628f99d7SShri Abhyankar         if (*pc2 != 0.0) {
2201628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2202628f99d7SShri Abhyankar           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2203628f99d7SShri Abhyankar           *pc2   = mul2;
2204628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2205628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2206628f99d7SShri Abhyankar             idx = pj[j];
2207628f99d7SShri Abhyankar             tmp = rtmp11[idx];
2208628f99d7SShri Abhyankar             rtmp22[idx] -= mul2 * tmp;
2209628f99d7SShri Abhyankar           }
22109566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2211628f99d7SShri Abhyankar         }
2212628f99d7SShri Abhyankar 
2213628f99d7SShri Abhyankar         pj  = bj + bi[row];
2214628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2215628f99d7SShri Abhyankar         pc2 = ba + bi[row + 1];
2216628f99d7SShri Abhyankar 
2217628f99d7SShri Abhyankar         sctx.pv         = rtmp22[row + 1];
2218628f99d7SShri Abhyankar         rs              = 0.0;
2219628f99d7SShri Abhyankar         rtmp11[row]     = 1.0 / rtmp11[row];
2220628f99d7SShri Abhyankar         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2221628f99d7SShri Abhyankar         /* copy row entries from dense representation to sparse */
2222628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2223628f99d7SShri Abhyankar           idx    = pj[j];
2224628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx];
2225628f99d7SShri Abhyankar           pc2[j] = rtmp22[idx];
2226628f99d7SShri Abhyankar           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2227628f99d7SShri Abhyankar         }
2228628f99d7SShri Abhyankar         sctx.rs = rs;
22299566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
223007b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2231628f99d7SShri Abhyankar         break;
2232628f99d7SShri Abhyankar 
2233628f99d7SShri Abhyankar       case 3:
2234628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2235628f99d7SShri Abhyankar           idx         = bjtmp[j];
2236628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2237628f99d7SShri Abhyankar           rtmp22[idx] = 0.0;
2238628f99d7SShri Abhyankar           rtmp33[idx] = 0.0;
2239628f99d7SShri Abhyankar         }
2240628f99d7SShri Abhyankar         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2241628f99d7SShri Abhyankar         idx    = r[row];
2242628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2243628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2244628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2245628f99d7SShri Abhyankar         v2     = aa + ai[idx + 1];
2246628f99d7SShri Abhyankar         v3     = aa + ai[idx + 2];
2247628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2248628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2249628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2250628f99d7SShri Abhyankar           rtmp22[idx] = v2[j];
2251628f99d7SShri Abhyankar           rtmp33[idx] = v3[j];
2252628f99d7SShri Abhyankar         }
2253628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2254628f99d7SShri Abhyankar         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2255628f99d7SShri Abhyankar         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2256628f99d7SShri Abhyankar 
2257628f99d7SShri Abhyankar         /* loop over all pivot row blocks above this row block */
2258628f99d7SShri Abhyankar         prow = *bjtmp++;
2259628f99d7SShri Abhyankar         while (prow < row) {
2260628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2261628f99d7SShri Abhyankar           pc2 = rtmp22 + prow;
2262628f99d7SShri Abhyankar           pc3 = rtmp33 + prow;
2263628f99d7SShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2264628f99d7SShri Abhyankar             pv   = ba + bd[prow];
2265628f99d7SShri Abhyankar             pj   = nbj + bd[prow];
2266628f99d7SShri Abhyankar             mul1 = *pc1 * *pv;
2267628f99d7SShri Abhyankar             mul2 = *pc2 * *pv;
2268628f99d7SShri Abhyankar             mul3 = *pc3 * *pv;
2269628f99d7SShri Abhyankar             ++pv;
2270628f99d7SShri Abhyankar             *pc1 = mul1;
2271628f99d7SShri Abhyankar             *pc2 = mul2;
2272628f99d7SShri Abhyankar             *pc3 = mul3;
2273628f99d7SShri Abhyankar 
2274628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2275628f99d7SShri Abhyankar             /* update this row based on pivot row */
2276628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2277628f99d7SShri Abhyankar               tmp = pv[j];
2278628f99d7SShri Abhyankar               idx = pj[j];
2279628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2280628f99d7SShri Abhyankar               rtmp22[idx] -= mul2 * tmp;
2281628f99d7SShri Abhyankar               rtmp33[idx] -= mul3 * tmp;
2282628f99d7SShri Abhyankar             }
22839566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2284628f99d7SShri Abhyankar           }
2285628f99d7SShri Abhyankar           prow = *bjtmp++;
2286628f99d7SShri Abhyankar         }
2287628f99d7SShri Abhyankar 
2288628f99d7SShri Abhyankar         /* Now take care of diagonal 3x3 block in this set of rows */
2289628f99d7SShri Abhyankar         /* note: prow = row here */
2290628f99d7SShri Abhyankar         pc1 = rtmp11 + prow;
2291628f99d7SShri Abhyankar         pc2 = rtmp22 + prow;
2292628f99d7SShri Abhyankar         pc3 = rtmp33 + prow;
2293628f99d7SShri Abhyankar 
2294628f99d7SShri Abhyankar         sctx.pv = *pc1;
2295628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2296628f99d7SShri Abhyankar         rs      = 0.0;
2297628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2298628f99d7SShri Abhyankar           idx = pj[j];
2299628f99d7SShri Abhyankar           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2300628f99d7SShri Abhyankar         }
2301628f99d7SShri Abhyankar         sctx.rs = rs;
23029566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
230307b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2304628f99d7SShri Abhyankar 
2305628f99d7SShri Abhyankar         if (*pc2 != 0.0 || *pc3 != 0.0) {
2306628f99d7SShri Abhyankar           mul2   = (*pc2) / (*pc1);
2307628f99d7SShri Abhyankar           mul3   = (*pc3) / (*pc1);
2308628f99d7SShri Abhyankar           *pc2   = mul2;
2309628f99d7SShri Abhyankar           *pc3   = mul3;
2310628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2311628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2312628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2313628f99d7SShri Abhyankar             idx = pj[j];
2314628f99d7SShri Abhyankar             tmp = rtmp11[idx];
2315628f99d7SShri Abhyankar             rtmp22[idx] -= mul2 * tmp;
2316628f99d7SShri Abhyankar             rtmp33[idx] -= mul3 * tmp;
2317628f99d7SShri Abhyankar           }
23189566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2319628f99d7SShri Abhyankar         }
2320628f99d7SShri Abhyankar         ++prow;
2321628f99d7SShri Abhyankar 
2322628f99d7SShri Abhyankar         pc2     = rtmp22 + prow;
2323628f99d7SShri Abhyankar         pc3     = rtmp33 + prow;
2324628f99d7SShri Abhyankar         sctx.pv = *pc2;
2325628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2326628f99d7SShri Abhyankar         rs      = 0.0;
2327628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2328628f99d7SShri Abhyankar           idx = pj[j];
2329628f99d7SShri Abhyankar           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2330628f99d7SShri Abhyankar         }
2331628f99d7SShri Abhyankar         sctx.rs = rs;
23329566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
233307b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2334628f99d7SShri Abhyankar 
2335628f99d7SShri Abhyankar         if (*pc3 != 0.0) {
2336628f99d7SShri Abhyankar           mul3   = (*pc3) / (*pc2);
2337628f99d7SShri Abhyankar           *pc3   = mul3;
2338628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2339628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2340628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2341628f99d7SShri Abhyankar             idx = pj[j];
2342628f99d7SShri Abhyankar             tmp = rtmp22[idx];
2343628f99d7SShri Abhyankar             rtmp33[idx] -= mul3 * tmp;
2344628f99d7SShri Abhyankar           }
23459566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2346628f99d7SShri Abhyankar         }
2347628f99d7SShri Abhyankar 
2348628f99d7SShri Abhyankar         pj  = bj + bi[row];
2349628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2350628f99d7SShri Abhyankar         pc2 = ba + bi[row + 1];
2351628f99d7SShri Abhyankar         pc3 = ba + bi[row + 2];
2352628f99d7SShri Abhyankar 
2353628f99d7SShri Abhyankar         sctx.pv         = rtmp33[row + 2];
2354628f99d7SShri Abhyankar         rs              = 0.0;
2355628f99d7SShri Abhyankar         rtmp11[row]     = 1.0 / rtmp11[row];
2356628f99d7SShri Abhyankar         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2357628f99d7SShri Abhyankar         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2358628f99d7SShri Abhyankar         /* copy row entries from dense representation to sparse */
2359628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2360628f99d7SShri Abhyankar           idx    = pj[j];
2361628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx];
2362628f99d7SShri Abhyankar           pc2[j] = rtmp22[idx];
2363628f99d7SShri Abhyankar           pc3[j] = rtmp33[idx];
2364628f99d7SShri Abhyankar           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2365628f99d7SShri Abhyankar         }
2366628f99d7SShri Abhyankar 
2367628f99d7SShri Abhyankar         sctx.rs = rs;
23689566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
236907b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2370628f99d7SShri Abhyankar         break;
2371628f99d7SShri Abhyankar 
2372d71ae5a4SJacob Faibussowitsch       default:
2373d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2374628f99d7SShri Abhyankar       }
2375628f99d7SShri Abhyankar       row += nodesz; /* Update the row */
2376628f99d7SShri Abhyankar     }
2377628f99d7SShri Abhyankar   endofwhile:;
237807b50cabSHong Zhang   } while (sctx.newshift);
23799566063dSJacob Faibussowitsch   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
23809566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec2));
23819566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
23829566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
23839566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &c));
23842205254eSKarl Rupp 
2385d3ac4fa3SBarry Smith   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2386628f99d7SShri Abhyankar   /* do not set solve add, since MatSolve_Inode + Add is faster */
2387628f99d7SShri Abhyankar   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2388628f99d7SShri Abhyankar   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2389628f99d7SShri Abhyankar   C->assembled              = PETSC_TRUE;
2390628f99d7SShri Abhyankar   C->preallocated           = PETSC_TRUE;
2391628f99d7SShri Abhyankar   if (sctx.nshift) {
2392f4db908eSBarry Smith     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
23939566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2394f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
23959566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2396628f99d7SShri Abhyankar     }
2397628f99d7SShri Abhyankar   }
23989566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(C->cmap->n));
23999566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCheckInode(C));
24003ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2401628f99d7SShri Abhyankar }
2402ff6a9541SJacob Faibussowitsch #endif
2403628f99d7SShri Abhyankar 
2404d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2405d71ae5a4SJacob Faibussowitsch {
2406019b515eSShri Abhyankar   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2407019b515eSShri Abhyankar   IS                 iscol = a->col, isrow = a->row;
2408019b515eSShri Abhyankar   const PetscInt    *r, *c, *rout, *cout;
24094d12350bSJunchao Zhang   PetscInt           i, j;
24108758e1faSBarry Smith   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
24118758e1faSBarry Smith   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2412019b515eSShri Abhyankar   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2413019b515eSShri Abhyankar   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2414019b515eSShri Abhyankar   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2415019b515eSShri Abhyankar   const PetscScalar *b;
2416019b515eSShri Abhyankar 
2417019b515eSShri Abhyankar   PetscFunctionBegin;
24184d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2419019b515eSShri Abhyankar   node_max = a->inode.node_count;
24204d12350bSJunchao Zhang   ns       = a->inode.size_csr; /* Node Size array */
2421019b515eSShri Abhyankar 
24229566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
24239566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(xx, &x));
2424019b515eSShri Abhyankar   tmp = a->solve_work;
2425019b515eSShri Abhyankar 
24269371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
24279371c9d4SSatish Balay   r = rout;
24289371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
24299371c9d4SSatish Balay   c = cout;
2430019b515eSShri Abhyankar 
2431019b515eSShri Abhyankar   /* forward solve the lower triangular */
2432019b515eSShri Abhyankar   tmps = tmp;
2433019b515eSShri Abhyankar   aa   = a_a;
2434019b515eSShri Abhyankar   aj   = a_j;
2435019b515eSShri Abhyankar   ad   = a->diag;
2436019b515eSShri Abhyankar 
24374d12350bSJunchao Zhang   for (i = 0; i < node_max; ++i) {
24384d12350bSJunchao Zhang     row = ns[i];
24394d12350bSJunchao Zhang     nsz = ns[i + 1] - ns[i];
2440019b515eSShri Abhyankar     aii = ai[row];
2441019b515eSShri Abhyankar     v1  = aa + aii;
2442019b515eSShri Abhyankar     vi  = aj + aii;
2443019b515eSShri Abhyankar     nz  = ai[row + 1] - ai[row];
2444019b515eSShri Abhyankar 
244598991853SShri Abhyankar     if (i < node_max - 1) {
244698991853SShri Abhyankar       /* Prefetch the indices for the next block */
244750d8bf02SJed Brown       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
244898991853SShri Abhyankar       /* Prefetch the data for the next block */
24494d12350bSJunchao Zhang       PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
245098991853SShri Abhyankar     }
245198991853SShri Abhyankar 
2452019b515eSShri Abhyankar     switch (nsz) { /* Each loop in 'case' is unrolled */
2453019b515eSShri Abhyankar     case 1:
2454019b515eSShri Abhyankar       sum1 = b[r[row]];
2455019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2456019b515eSShri Abhyankar         i0   = vi[j];
2457019b515eSShri Abhyankar         i1   = vi[j + 1];
2458019b515eSShri Abhyankar         tmp0 = tmps[i0];
2459019b515eSShri Abhyankar         tmp1 = tmps[i1];
2460019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2461019b515eSShri Abhyankar       }
2462019b515eSShri Abhyankar       if (j == nz - 1) {
2463019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2464019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2465019b515eSShri Abhyankar       }
2466019b515eSShri Abhyankar       tmp[row++] = sum1;
2467019b515eSShri Abhyankar       break;
2468019b515eSShri Abhyankar     case 2:
2469019b515eSShri Abhyankar       sum1 = b[r[row]];
2470019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2471019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2472019b515eSShri Abhyankar 
2473019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2474019b515eSShri Abhyankar         i0   = vi[j];
2475019b515eSShri Abhyankar         i1   = vi[j + 1];
2476019b515eSShri Abhyankar         tmp0 = tmps[i0];
2477019b515eSShri Abhyankar         tmp1 = tmps[i1];
2478019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2479019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2480019b515eSShri Abhyankar       }
2481019b515eSShri Abhyankar       if (j == nz - 1) {
2482019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2483019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2484019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2485019b515eSShri Abhyankar       }
2486019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2487019b515eSShri Abhyankar       tmp[row++] = sum1;
2488019b515eSShri Abhyankar       tmp[row++] = sum2;
2489019b515eSShri Abhyankar       break;
2490019b515eSShri Abhyankar     case 3:
2491019b515eSShri Abhyankar       sum1 = b[r[row]];
2492019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2493019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2494019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2495019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2496019b515eSShri Abhyankar 
2497019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2498019b515eSShri Abhyankar         i0   = vi[j];
2499019b515eSShri Abhyankar         i1   = vi[j + 1];
2500019b515eSShri Abhyankar         tmp0 = tmps[i0];
2501019b515eSShri Abhyankar         tmp1 = tmps[i1];
2502019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2503019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2504019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2505019b515eSShri Abhyankar       }
2506019b515eSShri Abhyankar       if (j == nz - 1) {
2507019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2508019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2509019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2510019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2511019b515eSShri Abhyankar       }
2512019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2513019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2514019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2515019b515eSShri Abhyankar       tmp[row++] = sum1;
2516019b515eSShri Abhyankar       tmp[row++] = sum2;
2517019b515eSShri Abhyankar       tmp[row++] = sum3;
2518019b515eSShri Abhyankar       break;
2519019b515eSShri Abhyankar 
2520019b515eSShri Abhyankar     case 4:
2521019b515eSShri Abhyankar       sum1 = b[r[row]];
2522019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2523019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2524019b515eSShri Abhyankar       sum4 = b[r[row + 3]];
2525019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2526019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2527019b515eSShri Abhyankar       v4   = aa + ai[row + 3];
2528019b515eSShri Abhyankar 
2529019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2530019b515eSShri Abhyankar         i0   = vi[j];
2531019b515eSShri Abhyankar         i1   = vi[j + 1];
2532019b515eSShri Abhyankar         tmp0 = tmps[i0];
2533019b515eSShri Abhyankar         tmp1 = tmps[i1];
2534019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2535019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2536019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2537019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2538019b515eSShri Abhyankar       }
2539019b515eSShri Abhyankar       if (j == nz - 1) {
2540019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2541019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2542019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2543019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2544019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0;
2545019b515eSShri Abhyankar       }
2546019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2547019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2548019b515eSShri Abhyankar       sum4 -= v4[nz] * sum1;
2549019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2550019b515eSShri Abhyankar       sum4 -= v4[nz + 1] * sum2;
2551019b515eSShri Abhyankar       sum4 -= v4[nz + 2] * sum3;
2552019b515eSShri Abhyankar 
2553019b515eSShri Abhyankar       tmp[row++] = sum1;
2554019b515eSShri Abhyankar       tmp[row++] = sum2;
2555019b515eSShri Abhyankar       tmp[row++] = sum3;
2556019b515eSShri Abhyankar       tmp[row++] = sum4;
2557019b515eSShri Abhyankar       break;
2558019b515eSShri Abhyankar     case 5:
2559019b515eSShri Abhyankar       sum1 = b[r[row]];
2560019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2561019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2562019b515eSShri Abhyankar       sum4 = b[r[row + 3]];
2563019b515eSShri Abhyankar       sum5 = b[r[row + 4]];
2564019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2565019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2566019b515eSShri Abhyankar       v4   = aa + ai[row + 3];
2567019b515eSShri Abhyankar       v5   = aa + ai[row + 4];
2568019b515eSShri Abhyankar 
2569019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2570019b515eSShri Abhyankar         i0   = vi[j];
2571019b515eSShri Abhyankar         i1   = vi[j + 1];
2572019b515eSShri Abhyankar         tmp0 = tmps[i0];
2573019b515eSShri Abhyankar         tmp1 = tmps[i1];
2574019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2575019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2576019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2577019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2578019b515eSShri Abhyankar         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2579019b515eSShri Abhyankar       }
2580019b515eSShri Abhyankar       if (j == nz - 1) {
2581019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2582019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2583019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2584019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2585019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0;
2586019b515eSShri Abhyankar         sum5 -= v5[j] * tmp0;
2587019b515eSShri Abhyankar       }
2588019b515eSShri Abhyankar 
2589019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2590019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2591019b515eSShri Abhyankar       sum4 -= v4[nz] * sum1;
2592019b515eSShri Abhyankar       sum5 -= v5[nz] * sum1;
2593019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2594019b515eSShri Abhyankar       sum4 -= v4[nz + 1] * sum2;
2595019b515eSShri Abhyankar       sum5 -= v5[nz + 1] * sum2;
2596019b515eSShri Abhyankar       sum4 -= v4[nz + 2] * sum3;
2597019b515eSShri Abhyankar       sum5 -= v5[nz + 2] * sum3;
2598019b515eSShri Abhyankar       sum5 -= v5[nz + 3] * sum4;
2599019b515eSShri Abhyankar 
2600019b515eSShri Abhyankar       tmp[row++] = sum1;
2601019b515eSShri Abhyankar       tmp[row++] = sum2;
2602019b515eSShri Abhyankar       tmp[row++] = sum3;
2603019b515eSShri Abhyankar       tmp[row++] = sum4;
2604019b515eSShri Abhyankar       tmp[row++] = sum5;
2605019b515eSShri Abhyankar       break;
2606d71ae5a4SJacob Faibussowitsch     default:
2607d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2608019b515eSShri Abhyankar     }
2609019b515eSShri Abhyankar   }
2610019b515eSShri Abhyankar   /* backward solve the upper triangular */
26114d12350bSJunchao Zhang   for (i = node_max - 1; i >= 0; i--) {
26124d12350bSJunchao Zhang     row = ns[i + 1] - 1;
26134d12350bSJunchao Zhang     nsz = ns[i + 1] - ns[i];
2614019b515eSShri Abhyankar     aii = ad[row + 1] + 1;
2615019b515eSShri Abhyankar     v1  = aa + aii;
2616019b515eSShri Abhyankar     vi  = aj + aii;
2617019b515eSShri Abhyankar     nz  = ad[row] - ad[row + 1] - 1;
261898991853SShri Abhyankar 
261998991853SShri Abhyankar     if (i > 0) {
262098991853SShri Abhyankar       /* Prefetch the indices for the next block */
262150d8bf02SJed Brown       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
262298991853SShri Abhyankar       /* Prefetch the data for the next block */
26234d12350bSJunchao Zhang       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
262498991853SShri Abhyankar     }
262598991853SShri Abhyankar 
2626019b515eSShri Abhyankar     switch (nsz) { /* Each loop in 'case' is unrolled */
2627019b515eSShri Abhyankar     case 1:
2628019b515eSShri Abhyankar       sum1 = tmp[row];
2629019b515eSShri Abhyankar 
2630019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2631019b515eSShri Abhyankar         i0   = vi[j];
2632019b515eSShri Abhyankar         i1   = vi[j + 1];
2633019b515eSShri Abhyankar         tmp0 = tmps[i0];
2634019b515eSShri Abhyankar         tmp1 = tmps[i1];
2635019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2636019b515eSShri Abhyankar       }
2637019b515eSShri Abhyankar       if (j == nz - 1) {
2638019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2639019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2640019b515eSShri Abhyankar       }
26419371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum1 * v1[nz];
26429371c9d4SSatish Balay       row--;
2643019b515eSShri Abhyankar       break;
2644019b515eSShri Abhyankar     case 2:
2645019b515eSShri Abhyankar       sum1 = tmp[row];
2646019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2647019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2648019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2649019b515eSShri Abhyankar         i0   = vi[j];
2650019b515eSShri Abhyankar         i1   = vi[j + 1];
2651019b515eSShri Abhyankar         tmp0 = tmps[i0];
2652019b515eSShri Abhyankar         tmp1 = tmps[i1];
2653019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2654019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2655019b515eSShri Abhyankar       }
2656019b515eSShri Abhyankar       if (j == nz - 1) {
2657019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2658019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2659019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2660019b515eSShri Abhyankar       }
2661019b515eSShri Abhyankar 
26629371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
26639371c9d4SSatish Balay       row--;
2664019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
26659371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
26669371c9d4SSatish Balay       row--;
2667019b515eSShri Abhyankar       break;
2668019b515eSShri Abhyankar     case 3:
2669019b515eSShri Abhyankar       sum1 = tmp[row];
2670019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2671019b515eSShri Abhyankar       sum3 = tmp[row - 2];
2672019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2673019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2674019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2675019b515eSShri Abhyankar         i0   = vi[j];
2676019b515eSShri Abhyankar         i1   = vi[j + 1];
2677019b515eSShri Abhyankar         tmp0 = tmps[i0];
2678019b515eSShri Abhyankar         tmp1 = tmps[i1];
2679019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2680019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2681019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2682019b515eSShri Abhyankar       }
2683019b515eSShri Abhyankar       if (j == nz - 1) {
2684019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2685019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2686019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2687019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2688019b515eSShri Abhyankar       }
26899371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
26909371c9d4SSatish Balay       row--;
2691019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2692019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
26939371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
26949371c9d4SSatish Balay       row--;
2695019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
26969371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
26979371c9d4SSatish Balay       row--;
2698019b515eSShri Abhyankar 
2699019b515eSShri Abhyankar       break;
2700019b515eSShri Abhyankar     case 4:
2701019b515eSShri Abhyankar       sum1 = tmp[row];
2702019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2703019b515eSShri Abhyankar       sum3 = tmp[row - 2];
2704019b515eSShri Abhyankar       sum4 = tmp[row - 3];
2705019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2706019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2707019b515eSShri Abhyankar       v4   = aa + ad[row - 2] + 1;
2708019b515eSShri Abhyankar 
2709019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2710019b515eSShri Abhyankar         i0   = vi[j];
2711019b515eSShri Abhyankar         i1   = vi[j + 1];
2712019b515eSShri Abhyankar         tmp0 = tmps[i0];
2713019b515eSShri Abhyankar         tmp1 = tmps[i1];
2714019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2715019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2716019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2717019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2718019b515eSShri Abhyankar       }
2719019b515eSShri Abhyankar       if (j == nz - 1) {
2720019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2721019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2722019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2723019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2724019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0;
2725019b515eSShri Abhyankar       }
2726019b515eSShri Abhyankar 
27279371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
27289371c9d4SSatish Balay       row--;
2729019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2730019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
2731019b515eSShri Abhyankar       sum4 -= v4[2] * tmp0;
27329371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
27339371c9d4SSatish Balay       row--;
2734019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
2735019b515eSShri Abhyankar       sum4 -= v4[1] * tmp0;
27369371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
27379371c9d4SSatish Balay       row--;
2738019b515eSShri Abhyankar       sum4 -= v4[0] * tmp0;
27399371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
27409371c9d4SSatish Balay       row--;
2741019b515eSShri Abhyankar       break;
2742019b515eSShri Abhyankar     case 5:
2743019b515eSShri Abhyankar       sum1 = tmp[row];
2744019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2745019b515eSShri Abhyankar       sum3 = tmp[row - 2];
2746019b515eSShri Abhyankar       sum4 = tmp[row - 3];
2747019b515eSShri Abhyankar       sum5 = tmp[row - 4];
2748019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2749019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2750019b515eSShri Abhyankar       v4   = aa + ad[row - 2] + 1;
2751019b515eSShri Abhyankar       v5   = aa + ad[row - 3] + 1;
2752019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2753019b515eSShri Abhyankar         i0   = vi[j];
2754019b515eSShri Abhyankar         i1   = vi[j + 1];
2755019b515eSShri Abhyankar         tmp0 = tmps[i0];
2756019b515eSShri Abhyankar         tmp1 = tmps[i1];
2757019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2758019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2759019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2760019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2761019b515eSShri Abhyankar         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2762019b515eSShri Abhyankar       }
2763019b515eSShri Abhyankar       if (j == nz - 1) {
2764019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2765019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2766019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2767019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2768019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0;
2769019b515eSShri Abhyankar         sum5 -= v5[j + 4] * tmp0;
2770019b515eSShri Abhyankar       }
2771019b515eSShri Abhyankar 
27729371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
27739371c9d4SSatish Balay       row--;
2774019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2775019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
2776019b515eSShri Abhyankar       sum4 -= v4[2] * tmp0;
2777019b515eSShri Abhyankar       sum5 -= v5[3] * tmp0;
27789371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
27799371c9d4SSatish Balay       row--;
2780019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
2781019b515eSShri Abhyankar       sum4 -= v4[1] * tmp0;
2782019b515eSShri Abhyankar       sum5 -= v5[2] * tmp0;
27839371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
27849371c9d4SSatish Balay       row--;
2785019b515eSShri Abhyankar       sum4 -= v4[0] * tmp0;
2786019b515eSShri Abhyankar       sum5 -= v5[1] * tmp0;
27879371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
27889371c9d4SSatish Balay       row--;
2789019b515eSShri Abhyankar       sum5 -= v5[0] * tmp0;
27909371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
27919371c9d4SSatish Balay       row--;
2792019b515eSShri Abhyankar       break;
2793d71ae5a4SJacob Faibussowitsch     default:
2794d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2795019b515eSShri Abhyankar     }
2796019b515eSShri Abhyankar   }
27979566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
27989566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
27999566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
28009566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(xx, &x));
28019566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
28023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2803019b515eSShri Abhyankar }
2804019b515eSShri Abhyankar 
28054c1414c8SBarry Smith /*
28064c1414c8SBarry Smith      Makes a longer coloring[] array and calls the usual code with that
28074c1414c8SBarry Smith */
280866976f2fSJacob Faibussowitsch static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2809d71ae5a4SJacob Faibussowitsch {
28104c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
28114d12350bSJunchao Zhang   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
28124c1414c8SBarry Smith   PetscInt        *colorused, i;
28134c1414c8SBarry Smith   ISColoringValue *newcolor;
28144c1414c8SBarry Smith 
28154c1414c8SBarry Smith   PetscFunctionBegin;
28164d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
28179566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &newcolor));
28184c1414c8SBarry Smith   /* loop over inodes, marking a color for each column*/
28194c1414c8SBarry Smith   row = 0;
28204c1414c8SBarry Smith   for (i = 0; i < m; i++) {
28214d12350bSJunchao Zhang     for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
28224c1414c8SBarry Smith   }
28234c1414c8SBarry Smith 
28244c1414c8SBarry Smith   /* eliminate unneeded colors */
28259566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2826ad540459SPierre Jolivet   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
28274c1414c8SBarry Smith 
2828ad540459SPierre Jolivet   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
28294c1414c8SBarry Smith   ncolors = colorused[5 * ncolors - 1];
28306497c311SBarry Smith   for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
28319566063dSJacob Faibussowitsch   PetscCall(PetscFree(colorused));
28329566063dSJacob Faibussowitsch   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
28339566063dSJacob Faibussowitsch   PetscCall(PetscFree(coloring));
28343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
28354c1414c8SBarry Smith }
28364c1414c8SBarry Smith 
2837af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
28382af78befSBarry Smith 
2839d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2840d71ae5a4SJacob Faibussowitsch {
28412af78befSBarry Smith   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
28427aaeff0aSMatthew G. Knepley   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
28435850ef23SBarry Smith   MatScalar         *ibdiag, *bdiag, work[25], *t;
2844a8b09249SBarry Smith   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
28457aaeff0aSMatthew G. Knepley   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
28465850ef23SBarry Smith   const PetscScalar *xb, *b;
28477b6c816cSBarry Smith   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
28484d12350bSJunchao Zhang   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2, nodesz;
28498758e1faSBarry Smith   PetscInt           sz, k, ipvt[5];
28507b6c816cSBarry Smith   PetscBool          allowzeropivot, zeropivotdetected;
28514d12350bSJunchao Zhang   const PetscInt    *sizes = a->inode.size_csr, *idx, *diag = a->diag, *ii = a->i;
28522af78befSBarry Smith 
28532af78befSBarry Smith   PetscFunctionBegin;
2854a455e926SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
28554d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
285608401ef6SPierre Jolivet   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
285708401ef6SPierre Jolivet   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
28582af78befSBarry Smith 
285971f1c65dSBarry Smith   if (!a->inode.ibdiagvalid) {
28602af78befSBarry Smith     if (!a->inode.ibdiag) {
28612af78befSBarry Smith       /* calculate space needed for diagonal blocks */
28624d12350bSJunchao Zhang       for (i = 0; i < m; i++) {
28634d12350bSJunchao Zhang         nodesz = sizes[i + 1] - sizes[i];
28644d12350bSJunchao Zhang         cnt += nodesz * nodesz;
28654d12350bSJunchao Zhang       }
2866f0d39aaaSBarry Smith       a->inode.bdiagsize = cnt;
28672205254eSKarl Rupp 
28689566063dSJacob Faibussowitsch       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
286971f1c65dSBarry Smith     }
287071f1c65dSBarry Smith 
287171f1c65dSBarry Smith     /* copy over the diagonal blocks and invert them */
28722af78befSBarry Smith     ibdiag = a->inode.ibdiag;
28732af78befSBarry Smith     bdiag  = a->inode.bdiag;
28742af78befSBarry Smith     cnt    = 0;
28752af78befSBarry Smith     for (i = 0, row = 0; i < m; i++) {
28764d12350bSJunchao Zhang       nodesz = sizes[i + 1] - sizes[i];
28774d12350bSJunchao Zhang       for (j = 0; j < nodesz; j++) {
28784d12350bSJunchao Zhang         for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
28792af78befSBarry Smith       }
28804d12350bSJunchao Zhang       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));
28812af78befSBarry Smith 
28824d12350bSJunchao Zhang       switch (nodesz) {
28832af78befSBarry Smith       case 1:
28842af78befSBarry Smith         /* Create matrix data structure */
28858e0e2a9aSHong Zhang         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
28868e0e2a9aSHong Zhang           if (allowzeropivot) {
28877b6c816cSBarry Smith             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28887b6c816cSBarry Smith             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
28897b6c816cSBarry Smith             A->factorerror_zeropivot_row   = row;
28909566063dSJacob Faibussowitsch             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
289198921bdaSJacob Faibussowitsch           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
28928e0e2a9aSHong Zhang         }
289364c62002SMatthew Knepley         ibdiag[cnt] = 1.0 / ibdiag[cnt];
28942af78befSBarry Smith         break;
28952af78befSBarry Smith       case 2:
28969566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
28977b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28982af78befSBarry Smith         break;
28992af78befSBarry Smith       case 3:
29009566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
29017b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
29022af78befSBarry Smith         break;
29032af78befSBarry Smith       case 4:
29049566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
29057b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
29062af78befSBarry Smith         break;
29072af78befSBarry Smith       case 5:
29089566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
29097b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
29102af78befSBarry Smith         break;
2911d71ae5a4SJacob Faibussowitsch       default:
29124d12350bSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
29132af78befSBarry Smith       }
29144d12350bSJunchao Zhang       cnt += nodesz * nodesz;
29154d12350bSJunchao Zhang       row += nodesz;
29162af78befSBarry Smith     }
291771f1c65dSBarry Smith     a->inode.ibdiagvalid = PETSC_TRUE;
29182af78befSBarry Smith   }
29192af78befSBarry Smith   ibdiag = a->inode.ibdiag;
29202af78befSBarry Smith   bdiag  = a->inode.bdiag;
29215850ef23SBarry Smith   t      = a->inode.ssor_work;
29222af78befSBarry Smith 
29239566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
29249566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
29255850ef23SBarry Smith   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
29265850ef23SBarry Smith   if (flag & SOR_ZERO_INITIAL_GUESS) {
29272af78befSBarry Smith     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
29288862d2efSBarry Smith       for (i = 0, row = 0; i < m; i++) {
29298862d2efSBarry Smith         sz  = diag[row] - ii[row];
29308862d2efSBarry Smith         v1  = a->a + ii[row];
29318862d2efSBarry Smith         idx = a->j + ii[row];
29328862d2efSBarry Smith 
29334108e4d5SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
29344d12350bSJunchao Zhang         nodesz = sizes[i + 1] - sizes[i];
29354d12350bSJunchao Zhang         switch (nodesz) {
29368862d2efSBarry Smith         case 1:
29378862d2efSBarry Smith 
29388862d2efSBarry Smith           sum1 = b[row];
29398862d2efSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
29408862d2efSBarry Smith             i1 = idx[0];
29418862d2efSBarry Smith             i2 = idx[1];
29428862d2efSBarry Smith             idx += 2;
29438862d2efSBarry Smith             tmp0 = x[i1];
29448862d2efSBarry Smith             tmp1 = x[i2];
29459371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29469371c9d4SSatish Balay             v1 += 2;
29478862d2efSBarry Smith           }
29488862d2efSBarry Smith 
29498862d2efSBarry Smith           if (n == sz - 1) {
2950f0d39aaaSBarry Smith             tmp0 = x[*idx];
2951f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
29528862d2efSBarry Smith           }
29535850ef23SBarry Smith           t[row]   = sum1;
29548862d2efSBarry Smith           x[row++] = sum1 * (*ibdiag++);
29558862d2efSBarry Smith           break;
2956f0d39aaaSBarry Smith         case 2:
2957f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
2958f0d39aaaSBarry Smith           sum1 = b[row];
2959f0d39aaaSBarry Smith           sum2 = b[row + 1];
2960f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
2961f0d39aaaSBarry Smith             i1 = idx[0];
2962f0d39aaaSBarry Smith             i2 = idx[1];
2963f0d39aaaSBarry Smith             idx += 2;
2964f0d39aaaSBarry Smith             tmp0 = x[i1];
2965f0d39aaaSBarry Smith             tmp1 = x[i2];
29669371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29679371c9d4SSatish Balay             v1 += 2;
29689371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
29699371c9d4SSatish Balay             v2 += 2;
2970f0d39aaaSBarry Smith           }
2971f0d39aaaSBarry Smith 
2972f0d39aaaSBarry Smith           if (n == sz - 1) {
2973f0d39aaaSBarry Smith             tmp0 = x[*idx];
2974f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
2975f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
2976f0d39aaaSBarry Smith           }
29775850ef23SBarry Smith           t[row]     = sum1;
29785850ef23SBarry Smith           t[row + 1] = sum2;
2979f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2980f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2981f0d39aaaSBarry Smith           ibdiag += 4;
2982f0d39aaaSBarry Smith           break;
2983f0d39aaaSBarry Smith         case 3:
2984f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
2985f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
2986f0d39aaaSBarry Smith           sum1 = b[row];
2987f0d39aaaSBarry Smith           sum2 = b[row + 1];
2988f0d39aaaSBarry Smith           sum3 = b[row + 2];
2989f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
2990f0d39aaaSBarry Smith             i1 = idx[0];
2991f0d39aaaSBarry Smith             i2 = idx[1];
2992f0d39aaaSBarry Smith             idx += 2;
2993f0d39aaaSBarry Smith             tmp0 = x[i1];
2994f0d39aaaSBarry Smith             tmp1 = x[i2];
29959371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29969371c9d4SSatish Balay             v1 += 2;
29979371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
29989371c9d4SSatish Balay             v2 += 2;
29999371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30009371c9d4SSatish Balay             v3 += 2;
3001f0d39aaaSBarry Smith           }
3002f0d39aaaSBarry Smith 
3003f0d39aaaSBarry Smith           if (n == sz - 1) {
3004f0d39aaaSBarry Smith             tmp0 = x[*idx];
3005f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
3006f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
3007f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
3008f0d39aaaSBarry Smith           }
30095850ef23SBarry Smith           t[row]     = sum1;
30105850ef23SBarry Smith           t[row + 1] = sum2;
30115850ef23SBarry Smith           t[row + 2] = sum3;
3012f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3013f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3014f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3015f0d39aaaSBarry Smith           ibdiag += 9;
3016f0d39aaaSBarry Smith           break;
3017f0d39aaaSBarry Smith         case 4:
3018f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
3019f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
3020f0d39aaaSBarry Smith           v4   = a->a + ii[row + 3];
3021f0d39aaaSBarry Smith           sum1 = b[row];
3022f0d39aaaSBarry Smith           sum2 = b[row + 1];
3023f0d39aaaSBarry Smith           sum3 = b[row + 2];
3024f0d39aaaSBarry Smith           sum4 = b[row + 3];
3025f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3026f0d39aaaSBarry Smith             i1 = idx[0];
3027f0d39aaaSBarry Smith             i2 = idx[1];
3028f0d39aaaSBarry Smith             idx += 2;
3029f0d39aaaSBarry Smith             tmp0 = x[i1];
3030f0d39aaaSBarry Smith             tmp1 = x[i2];
30319371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
30329371c9d4SSatish Balay             v1 += 2;
30339371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
30349371c9d4SSatish Balay             v2 += 2;
30359371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30369371c9d4SSatish Balay             v3 += 2;
30379371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
30389371c9d4SSatish Balay             v4 += 2;
3039f0d39aaaSBarry Smith           }
3040f0d39aaaSBarry Smith 
3041f0d39aaaSBarry Smith           if (n == sz - 1) {
3042f0d39aaaSBarry Smith             tmp0 = x[*idx];
3043f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
3044f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
3045f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
3046f0d39aaaSBarry Smith             sum4 -= v4[0] * tmp0;
3047f0d39aaaSBarry Smith           }
30485850ef23SBarry Smith           t[row]     = sum1;
30495850ef23SBarry Smith           t[row + 1] = sum2;
30505850ef23SBarry Smith           t[row + 2] = sum3;
30515850ef23SBarry Smith           t[row + 3] = sum4;
3052f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3053f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3054f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3055f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3056f0d39aaaSBarry Smith           ibdiag += 16;
3057f0d39aaaSBarry Smith           break;
3058f0d39aaaSBarry Smith         case 5:
3059f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
3060f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
3061f0d39aaaSBarry Smith           v4   = a->a + ii[row + 3];
3062f0d39aaaSBarry Smith           v5   = a->a + ii[row + 4];
3063f0d39aaaSBarry Smith           sum1 = b[row];
3064f0d39aaaSBarry Smith           sum2 = b[row + 1];
3065f0d39aaaSBarry Smith           sum3 = b[row + 2];
3066f0d39aaaSBarry Smith           sum4 = b[row + 3];
3067f0d39aaaSBarry Smith           sum5 = b[row + 4];
3068f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3069f0d39aaaSBarry Smith             i1 = idx[0];
3070f0d39aaaSBarry Smith             i2 = idx[1];
3071f0d39aaaSBarry Smith             idx += 2;
3072f0d39aaaSBarry Smith             tmp0 = x[i1];
3073f0d39aaaSBarry Smith             tmp1 = x[i2];
30749371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
30759371c9d4SSatish Balay             v1 += 2;
30769371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
30779371c9d4SSatish Balay             v2 += 2;
30789371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30799371c9d4SSatish Balay             v3 += 2;
30809371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
30819371c9d4SSatish Balay             v4 += 2;
30829371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
30839371c9d4SSatish Balay             v5 += 2;
3084f0d39aaaSBarry Smith           }
3085f0d39aaaSBarry Smith 
3086f0d39aaaSBarry Smith           if (n == sz - 1) {
3087f0d39aaaSBarry Smith             tmp0 = x[*idx];
3088f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
3089f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
3090f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
3091f0d39aaaSBarry Smith             sum4 -= v4[0] * tmp0;
3092f0d39aaaSBarry Smith             sum5 -= v5[0] * tmp0;
3093f0d39aaaSBarry Smith           }
30945850ef23SBarry Smith           t[row]     = sum1;
30955850ef23SBarry Smith           t[row + 1] = sum2;
30965850ef23SBarry Smith           t[row + 2] = sum3;
30975850ef23SBarry Smith           t[row + 3] = sum4;
30985850ef23SBarry Smith           t[row + 4] = sum5;
3099f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3100f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3101f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3102f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3103f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3104f0d39aaaSBarry Smith           ibdiag += 25;
3105f0d39aaaSBarry Smith           break;
3106d71ae5a4SJacob Faibussowitsch         default:
31074d12350bSJunchao Zhang           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
31088862d2efSBarry Smith         }
31092af78befSBarry Smith       }
31102af78befSBarry Smith 
31115850ef23SBarry Smith       xb = t;
31129566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(a->nz));
31132af78befSBarry Smith     } else xb = b;
31142af78befSBarry Smith     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3115f0d39aaaSBarry Smith       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3116d0f46423SBarry Smith       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
31174d12350bSJunchao Zhang         nodesz = sizes[i + 1] - sizes[i];
31184d12350bSJunchao Zhang         ibdiag -= nodesz * nodesz;
31198862d2efSBarry Smith         sz  = ii[row + 1] - diag[row] - 1;
31208862d2efSBarry Smith         v1  = a->a + diag[row] + 1;
31218862d2efSBarry Smith         idx = a->j + diag[row] + 1;
31222af78befSBarry Smith 
31234108e4d5SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
31244d12350bSJunchao Zhang         switch (nodesz) {
31258862d2efSBarry Smith         case 1:
31268862d2efSBarry Smith 
31278862d2efSBarry Smith           sum1 = xb[row];
31288862d2efSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
31298862d2efSBarry Smith             i1 = idx[0];
31308862d2efSBarry Smith             i2 = idx[1];
31318862d2efSBarry Smith             idx += 2;
31328862d2efSBarry Smith             tmp0 = x[i1];
31338862d2efSBarry Smith             tmp1 = x[i2];
31349371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31359371c9d4SSatish Balay             v1 += 2;
31368862d2efSBarry Smith           }
31378862d2efSBarry Smith 
31388862d2efSBarry Smith           if (n == sz - 1) {
3139f0d39aaaSBarry Smith             tmp0 = x[*idx];
3140f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
31418862d2efSBarry Smith           }
3142f0d39aaaSBarry Smith           x[row--] = sum1 * (*ibdiag);
3143f0d39aaaSBarry Smith           break;
3144f0d39aaaSBarry Smith 
3145f0d39aaaSBarry Smith         case 2:
3146f0d39aaaSBarry Smith 
3147f0d39aaaSBarry Smith           sum1 = xb[row];
3148f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3149f0d39aaaSBarry Smith           /* note that sum1 is associated with the second of the two rows */
3150f0d39aaaSBarry Smith           v2 = a->a + diag[row - 1] + 2;
3151f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3152f0d39aaaSBarry Smith             i1 = idx[0];
3153f0d39aaaSBarry Smith             i2 = idx[1];
3154f0d39aaaSBarry Smith             idx += 2;
3155f0d39aaaSBarry Smith             tmp0 = x[i1];
3156f0d39aaaSBarry Smith             tmp1 = x[i2];
31579371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31589371c9d4SSatish Balay             v1 += 2;
31599371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31609371c9d4SSatish Balay             v2 += 2;
3161f0d39aaaSBarry Smith           }
3162f0d39aaaSBarry Smith 
3163f0d39aaaSBarry Smith           if (n == sz - 1) {
3164f0d39aaaSBarry Smith             tmp0 = x[*idx];
3165f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3166f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3167f0d39aaaSBarry Smith           }
3168f0d39aaaSBarry Smith           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3169f0d39aaaSBarry Smith           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3170f0d39aaaSBarry Smith           break;
3171f0d39aaaSBarry Smith         case 3:
3172f0d39aaaSBarry Smith 
3173f0d39aaaSBarry Smith           sum1 = xb[row];
3174f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3175f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3176f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3177f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3178f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3179f0d39aaaSBarry Smith             i1 = idx[0];
3180f0d39aaaSBarry Smith             i2 = idx[1];
3181f0d39aaaSBarry Smith             idx += 2;
3182f0d39aaaSBarry Smith             tmp0 = x[i1];
3183f0d39aaaSBarry Smith             tmp1 = x[i2];
31849371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31859371c9d4SSatish Balay             v1 += 2;
31869371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31879371c9d4SSatish Balay             v2 += 2;
31889371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
31899371c9d4SSatish Balay             v3 += 2;
3190f0d39aaaSBarry Smith           }
3191f0d39aaaSBarry Smith 
3192f0d39aaaSBarry Smith           if (n == sz - 1) {
3193f0d39aaaSBarry Smith             tmp0 = x[*idx];
3194f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3195f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3196f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3197f0d39aaaSBarry Smith           }
3198f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3199f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3200f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3201f0d39aaaSBarry Smith           break;
3202f0d39aaaSBarry Smith         case 4:
3203f0d39aaaSBarry Smith 
3204f0d39aaaSBarry Smith           sum1 = xb[row];
3205f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3206f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3207f0d39aaaSBarry Smith           sum4 = xb[row - 3];
3208f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3209f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3210f0d39aaaSBarry Smith           v4   = a->a + diag[row - 3] + 4;
3211f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3212f0d39aaaSBarry Smith             i1 = idx[0];
3213f0d39aaaSBarry Smith             i2 = idx[1];
3214f0d39aaaSBarry Smith             idx += 2;
3215f0d39aaaSBarry Smith             tmp0 = x[i1];
3216f0d39aaaSBarry Smith             tmp1 = x[i2];
32179371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
32189371c9d4SSatish Balay             v1 += 2;
32199371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
32209371c9d4SSatish Balay             v2 += 2;
32219371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
32229371c9d4SSatish Balay             v3 += 2;
32239371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
32249371c9d4SSatish Balay             v4 += 2;
3225f0d39aaaSBarry Smith           }
3226f0d39aaaSBarry Smith 
3227f0d39aaaSBarry Smith           if (n == sz - 1) {
3228f0d39aaaSBarry Smith             tmp0 = x[*idx];
3229f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3230f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3231f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3232f0d39aaaSBarry Smith             sum4 -= *v4 * tmp0;
3233f0d39aaaSBarry Smith           }
3234f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3235f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3236f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3237f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3238f0d39aaaSBarry Smith           break;
3239f0d39aaaSBarry Smith         case 5:
3240f0d39aaaSBarry Smith 
3241f0d39aaaSBarry Smith           sum1 = xb[row];
3242f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3243f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3244f0d39aaaSBarry Smith           sum4 = xb[row - 3];
3245f0d39aaaSBarry Smith           sum5 = xb[row - 4];
3246f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3247f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3248f0d39aaaSBarry Smith           v4   = a->a + diag[row - 3] + 4;
3249f0d39aaaSBarry Smith           v5   = a->a + diag[row - 4] + 5;
3250f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3251f0d39aaaSBarry Smith             i1 = idx[0];
3252f0d39aaaSBarry Smith             i2 = idx[1];
3253f0d39aaaSBarry Smith             idx += 2;
3254f0d39aaaSBarry Smith             tmp0 = x[i1];
3255f0d39aaaSBarry Smith             tmp1 = x[i2];
32569371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
32579371c9d4SSatish Balay             v1 += 2;
32589371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
32599371c9d4SSatish Balay             v2 += 2;
32609371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
32619371c9d4SSatish Balay             v3 += 2;
32629371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
32639371c9d4SSatish Balay             v4 += 2;
32649371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
32659371c9d4SSatish Balay             v5 += 2;
3266f0d39aaaSBarry Smith           }
3267f0d39aaaSBarry Smith 
3268f0d39aaaSBarry Smith           if (n == sz - 1) {
3269f0d39aaaSBarry Smith             tmp0 = x[*idx];
3270f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3271f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3272f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3273f0d39aaaSBarry Smith             sum4 -= *v4 * tmp0;
3274f0d39aaaSBarry Smith             sum5 -= *v5 * tmp0;
3275f0d39aaaSBarry Smith           }
3276f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3277f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3278f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3279f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3280f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
32818862d2efSBarry Smith           break;
3282d71ae5a4SJacob Faibussowitsch         default:
32834d12350bSJunchao Zhang           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
32848862d2efSBarry Smith         }
32852af78befSBarry Smith       }
32862af78befSBarry Smith 
32879566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(a->nz));
32882af78befSBarry Smith     }
32892af78befSBarry Smith     its--;
32905850ef23SBarry Smith   }
32915850ef23SBarry Smith   while (its--) {
32925850ef23SBarry Smith     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
32934d12350bSJunchao Zhang       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
32944d12350bSJunchao Zhang         nodesz = sizes[i + 1] - sizes[i];
3295d876e2b0SMark Adams         sz     = diag[row] - ii[row];
32965850ef23SBarry Smith         v1     = a->a + ii[row];
32975850ef23SBarry Smith         idx    = a->j + ii[row];
32985850ef23SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
32994d12350bSJunchao Zhang         switch (nodesz) {
33005850ef23SBarry Smith         case 1:
33015850ef23SBarry Smith           sum1 = b[row];
33025850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
33035850ef23SBarry Smith             i1 = idx[0];
33045850ef23SBarry Smith             i2 = idx[1];
33055850ef23SBarry Smith             idx += 2;
33065850ef23SBarry Smith             tmp0 = x[i1];
33075850ef23SBarry Smith             tmp1 = x[i2];
33089371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33099371c9d4SSatish Balay             v1 += 2;
33105850ef23SBarry Smith           }
33115850ef23SBarry Smith           if (n == sz - 1) {
3312d876e2b0SMark Adams             tmp0 = x[*idx++];
3313d876e2b0SMark Adams             sum1 -= *v1 * tmp0;
3314d876e2b0SMark Adams             v1++;
3315d876e2b0SMark Adams           }
3316d876e2b0SMark Adams           t[row] = sum1;
3317d876e2b0SMark Adams           sz     = ii[row + 1] - diag[row] - 1;
3318d876e2b0SMark Adams           idx    = a->j + diag[row] + 1;
3319d876e2b0SMark Adams           v1 += 1;
3320d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3321d876e2b0SMark Adams             i1 = idx[0];
3322d876e2b0SMark Adams             i2 = idx[1];
3323d876e2b0SMark Adams             idx += 2;
3324d876e2b0SMark Adams             tmp0 = x[i1];
3325d876e2b0SMark Adams             tmp1 = x[i2];
33269371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33279371c9d4SSatish Balay             v1 += 2;
3328d876e2b0SMark Adams           }
3329d876e2b0SMark Adams           if (n == sz - 1) {
3330d876e2b0SMark Adams             tmp0 = x[*idx++];
33315850ef23SBarry Smith             sum1 -= *v1 * tmp0;
33325850ef23SBarry Smith           }
33335850ef23SBarry Smith           /* in MatSOR_SeqAIJ this line would be
33345850ef23SBarry Smith            *
33355850ef23SBarry Smith            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
33365850ef23SBarry Smith            *
33375850ef23SBarry Smith            * but omega == 1, so this becomes
33385850ef23SBarry Smith            *
3339d876e2b0SMark Adams            * x[row] = sum1*(*ibdiag++);
33405850ef23SBarry Smith            *
33415850ef23SBarry Smith            */
3342d876e2b0SMark Adams           x[row] = sum1 * (*ibdiag);
33435850ef23SBarry Smith           break;
33445850ef23SBarry Smith         case 2:
33455850ef23SBarry Smith           v2   = a->a + ii[row + 1];
33465850ef23SBarry Smith           sum1 = b[row];
33475850ef23SBarry Smith           sum2 = b[row + 1];
33485850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
33495850ef23SBarry Smith             i1 = idx[0];
33505850ef23SBarry Smith             i2 = idx[1];
33515850ef23SBarry Smith             idx += 2;
33525850ef23SBarry Smith             tmp0 = x[i1];
33535850ef23SBarry Smith             tmp1 = x[i2];
33549371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33559371c9d4SSatish Balay             v1 += 2;
33569371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
33579371c9d4SSatish Balay             v2 += 2;
33585850ef23SBarry Smith           }
3359d876e2b0SMark Adams           if (n == sz - 1) {
3360d876e2b0SMark Adams             tmp0 = x[*idx++];
3361d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3362d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
33639371c9d4SSatish Balay             v1++;
33649371c9d4SSatish Balay             v2++;
3365d876e2b0SMark Adams           }
3366d876e2b0SMark Adams           t[row]     = sum1;
3367d876e2b0SMark Adams           t[row + 1] = sum2;
3368d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 2;
3369d876e2b0SMark Adams           idx        = a->j + diag[row] + 2;
3370d876e2b0SMark Adams           v1 += 2;
3371d876e2b0SMark Adams           v2 += 2;
3372d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3373d876e2b0SMark Adams             i1 = idx[0];
3374d876e2b0SMark Adams             i2 = idx[1];
3375d876e2b0SMark Adams             idx += 2;
3376d876e2b0SMark Adams             tmp0 = x[i1];
3377d876e2b0SMark Adams             tmp1 = x[i2];
33789371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33799371c9d4SSatish Balay             v1 += 2;
33809371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
33819371c9d4SSatish Balay             v2 += 2;
3382d876e2b0SMark Adams           }
33835850ef23SBarry Smith           if (n == sz - 1) {
33845850ef23SBarry Smith             tmp0 = x[*idx];
33855850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
33865850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
33875850ef23SBarry Smith           }
3388d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3389d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
33905850ef23SBarry Smith           break;
33915850ef23SBarry Smith         case 3:
33925850ef23SBarry Smith           v2   = a->a + ii[row + 1];
33935850ef23SBarry Smith           v3   = a->a + ii[row + 2];
33945850ef23SBarry Smith           sum1 = b[row];
33955850ef23SBarry Smith           sum2 = b[row + 1];
33965850ef23SBarry Smith           sum3 = b[row + 2];
33975850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
33985850ef23SBarry Smith             i1 = idx[0];
33995850ef23SBarry Smith             i2 = idx[1];
34005850ef23SBarry Smith             idx += 2;
34015850ef23SBarry Smith             tmp0 = x[i1];
34025850ef23SBarry Smith             tmp1 = x[i2];
34039371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34049371c9d4SSatish Balay             v1 += 2;
34059371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34069371c9d4SSatish Balay             v2 += 2;
34079371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34089371c9d4SSatish Balay             v3 += 2;
34095850ef23SBarry Smith           }
3410d876e2b0SMark Adams           if (n == sz - 1) {
3411d876e2b0SMark Adams             tmp0 = x[*idx++];
3412d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3413d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3414d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
34159371c9d4SSatish Balay             v1++;
34169371c9d4SSatish Balay             v2++;
34179371c9d4SSatish Balay             v3++;
3418d876e2b0SMark Adams           }
3419d876e2b0SMark Adams           t[row]     = sum1;
3420d876e2b0SMark Adams           t[row + 1] = sum2;
3421d876e2b0SMark Adams           t[row + 2] = sum3;
3422d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 3;
3423d876e2b0SMark Adams           idx        = a->j + diag[row] + 3;
3424d876e2b0SMark Adams           v1 += 3;
3425d876e2b0SMark Adams           v2 += 3;
3426d876e2b0SMark Adams           v3 += 3;
3427d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3428d876e2b0SMark Adams             i1 = idx[0];
3429d876e2b0SMark Adams             i2 = idx[1];
3430d876e2b0SMark Adams             idx += 2;
3431d876e2b0SMark Adams             tmp0 = x[i1];
3432d876e2b0SMark Adams             tmp1 = x[i2];
34339371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34349371c9d4SSatish Balay             v1 += 2;
34359371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34369371c9d4SSatish Balay             v2 += 2;
34379371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34389371c9d4SSatish Balay             v3 += 2;
3439d876e2b0SMark Adams           }
34405850ef23SBarry Smith           if (n == sz - 1) {
34415850ef23SBarry Smith             tmp0 = x[*idx];
34425850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
34435850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
34445850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
34455850ef23SBarry Smith           }
3446d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3447d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3448d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
34495850ef23SBarry Smith           break;
34505850ef23SBarry Smith         case 4:
34515850ef23SBarry Smith           v2   = a->a + ii[row + 1];
34525850ef23SBarry Smith           v3   = a->a + ii[row + 2];
34535850ef23SBarry Smith           v4   = a->a + ii[row + 3];
34545850ef23SBarry Smith           sum1 = b[row];
34555850ef23SBarry Smith           sum2 = b[row + 1];
34565850ef23SBarry Smith           sum3 = b[row + 2];
34575850ef23SBarry Smith           sum4 = b[row + 3];
34585850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
34595850ef23SBarry Smith             i1 = idx[0];
34605850ef23SBarry Smith             i2 = idx[1];
34615850ef23SBarry Smith             idx += 2;
34625850ef23SBarry Smith             tmp0 = x[i1];
34635850ef23SBarry Smith             tmp1 = x[i2];
34649371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34659371c9d4SSatish Balay             v1 += 2;
34669371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34679371c9d4SSatish Balay             v2 += 2;
34689371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34699371c9d4SSatish Balay             v3 += 2;
34709371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
34719371c9d4SSatish Balay             v4 += 2;
34725850ef23SBarry Smith           }
3473d876e2b0SMark Adams           if (n == sz - 1) {
3474d876e2b0SMark Adams             tmp0 = x[*idx++];
3475d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3476d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3477d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
3478d876e2b0SMark Adams             sum4 -= v4[0] * tmp0;
34799371c9d4SSatish Balay             v1++;
34809371c9d4SSatish Balay             v2++;
34819371c9d4SSatish Balay             v3++;
34829371c9d4SSatish Balay             v4++;
3483d876e2b0SMark Adams           }
3484d876e2b0SMark Adams           t[row]     = sum1;
3485d876e2b0SMark Adams           t[row + 1] = sum2;
3486d876e2b0SMark Adams           t[row + 2] = sum3;
3487d876e2b0SMark Adams           t[row + 3] = sum4;
3488d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 4;
3489d876e2b0SMark Adams           idx        = a->j + diag[row] + 4;
3490d876e2b0SMark Adams           v1 += 4;
3491d876e2b0SMark Adams           v2 += 4;
3492d876e2b0SMark Adams           v3 += 4;
3493d876e2b0SMark Adams           v4 += 4;
3494d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3495d876e2b0SMark Adams             i1 = idx[0];
3496d876e2b0SMark Adams             i2 = idx[1];
3497d876e2b0SMark Adams             idx += 2;
3498d876e2b0SMark Adams             tmp0 = x[i1];
3499d876e2b0SMark Adams             tmp1 = x[i2];
35009371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
35019371c9d4SSatish Balay             v1 += 2;
35029371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
35039371c9d4SSatish Balay             v2 += 2;
35049371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
35059371c9d4SSatish Balay             v3 += 2;
35069371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
35079371c9d4SSatish Balay             v4 += 2;
3508d876e2b0SMark Adams           }
35095850ef23SBarry Smith           if (n == sz - 1) {
35105850ef23SBarry Smith             tmp0 = x[*idx];
35115850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
35125850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
35135850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
35145850ef23SBarry Smith             sum4 -= v4[0] * tmp0;
35155850ef23SBarry Smith           }
3516d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3517d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3518d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3519d876e2b0SMark Adams           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
35205850ef23SBarry Smith           break;
35215850ef23SBarry Smith         case 5:
35225850ef23SBarry Smith           v2   = a->a + ii[row + 1];
35235850ef23SBarry Smith           v3   = a->a + ii[row + 2];
35245850ef23SBarry Smith           v4   = a->a + ii[row + 3];
35255850ef23SBarry Smith           v5   = a->a + ii[row + 4];
35265850ef23SBarry Smith           sum1 = b[row];
35275850ef23SBarry Smith           sum2 = b[row + 1];
35285850ef23SBarry Smith           sum3 = b[row + 2];
35295850ef23SBarry Smith           sum4 = b[row + 3];
35305850ef23SBarry Smith           sum5 = b[row + 4];
35315850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
35325850ef23SBarry Smith             i1 = idx[0];
35335850ef23SBarry Smith             i2 = idx[1];
35345850ef23SBarry Smith             idx += 2;
35355850ef23SBarry Smith             tmp0 = x[i1];
35365850ef23SBarry Smith             tmp1 = x[i2];
35379371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
35389371c9d4SSatish Balay             v1 += 2;
35399371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
35409371c9d4SSatish Balay             v2 += 2;
35419371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
35429371c9d4SSatish Balay             v3 += 2;
35439371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
35449371c9d4SSatish Balay             v4 += 2;
35459371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
35469371c9d4SSatish Balay             v5 += 2;
35475850ef23SBarry Smith           }
35485850ef23SBarry Smith           if (n == sz - 1) {
3549d876e2b0SMark Adams             tmp0 = x[*idx++];
35505850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
35515850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
35525850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
35535850ef23SBarry Smith             sum4 -= v4[0] * tmp0;
35545850ef23SBarry Smith             sum5 -= v5[0] * tmp0;
35559371c9d4SSatish Balay             v1++;
35569371c9d4SSatish Balay             v2++;
35579371c9d4SSatish Balay             v3++;
35589371c9d4SSatish Balay             v4++;
35599371c9d4SSatish Balay             v5++;
35605850ef23SBarry Smith           }
3561d876e2b0SMark Adams           t[row]     = sum1;
3562d876e2b0SMark Adams           t[row + 1] = sum2;
3563d876e2b0SMark Adams           t[row + 2] = sum3;
3564d876e2b0SMark Adams           t[row + 3] = sum4;
3565d876e2b0SMark Adams           t[row + 4] = sum5;
3566d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 5;
3567d876e2b0SMark Adams           idx        = a->j + diag[row] + 5;
3568d876e2b0SMark Adams           v1 += 5;
3569d876e2b0SMark Adams           v2 += 5;
3570d876e2b0SMark Adams           v3 += 5;
3571d876e2b0SMark Adams           v4 += 5;
3572d876e2b0SMark Adams           v5 += 5;
35735850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
35745850ef23SBarry Smith             i1 = idx[0];
35755850ef23SBarry Smith             i2 = idx[1];
35765850ef23SBarry Smith             idx += 2;
35775850ef23SBarry Smith             tmp0 = x[i1];
35785850ef23SBarry Smith             tmp1 = x[i2];
35799371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
35809371c9d4SSatish Balay             v1 += 2;
35819371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
35829371c9d4SSatish Balay             v2 += 2;
35839371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
35849371c9d4SSatish Balay             v3 += 2;
35859371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
35869371c9d4SSatish Balay             v4 += 2;
35879371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
35889371c9d4SSatish Balay             v5 += 2;
35895850ef23SBarry Smith           }
35905850ef23SBarry Smith           if (n == sz - 1) {
35915850ef23SBarry Smith             tmp0 = x[*idx];
3592d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3593d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3594d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
3595d876e2b0SMark Adams             sum4 -= v4[0] * tmp0;
3596d876e2b0SMark Adams             sum5 -= v5[0] * tmp0;
35975850ef23SBarry Smith           }
3598d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3599d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3600d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3601d876e2b0SMark Adams           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3602d876e2b0SMark Adams           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3603d876e2b0SMark Adams           break;
3604d71ae5a4SJacob Faibussowitsch         default:
36054d12350bSJunchao Zhang           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3606d876e2b0SMark Adams         }
3607d876e2b0SMark Adams       }
3608d876e2b0SMark Adams       xb = t;
36099566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3610d876e2b0SMark Adams     } else xb = b;
3611d876e2b0SMark Adams 
3612d876e2b0SMark Adams     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3613d876e2b0SMark Adams       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3614d876e2b0SMark Adams       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
36154d12350bSJunchao Zhang         nodesz = sizes[i + 1] - sizes[i];
36164d12350bSJunchao Zhang         ibdiag -= nodesz * nodesz;
3617d876e2b0SMark Adams 
3618d876e2b0SMark Adams         /* set RHS */
3619d876e2b0SMark Adams         if (xb == b) {
3620d876e2b0SMark Adams           /* whole (old way) */
3621d876e2b0SMark Adams           sz  = ii[row + 1] - ii[row];
3622d876e2b0SMark Adams           idx = a->j + ii[row];
36234d12350bSJunchao Zhang           switch (nodesz) {
3624d71ae5a4SJacob Faibussowitsch           case 5:
3625d71ae5a4SJacob Faibussowitsch             v5 = a->a + ii[row - 4]; /* fall through */
3626d71ae5a4SJacob Faibussowitsch           case 4:
3627d71ae5a4SJacob Faibussowitsch             v4 = a->a + ii[row - 3]; /* fall through */
3628d71ae5a4SJacob Faibussowitsch           case 3:
3629d71ae5a4SJacob Faibussowitsch             v3 = a->a + ii[row - 2]; /* fall through */
3630d71ae5a4SJacob Faibussowitsch           case 2:
3631d71ae5a4SJacob Faibussowitsch             v2 = a->a + ii[row - 1]; /* fall through */
3632d71ae5a4SJacob Faibussowitsch           case 1:
3633d71ae5a4SJacob Faibussowitsch             v1 = a->a + ii[row];
3634d71ae5a4SJacob Faibussowitsch             break;
3635d71ae5a4SJacob Faibussowitsch           default:
36364d12350bSJunchao Zhang             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3637d876e2b0SMark Adams           }
3638d876e2b0SMark Adams         } else {
3639d876e2b0SMark Adams           /* upper, no diag */
3640d876e2b0SMark Adams           sz  = ii[row + 1] - diag[row] - 1;
3641d876e2b0SMark Adams           idx = a->j + diag[row] + 1;
36424d12350bSJunchao Zhang           switch (nodesz) {
3643d71ae5a4SJacob Faibussowitsch           case 5:
3644d71ae5a4SJacob Faibussowitsch             v5 = a->a + diag[row - 4] + 5; /* fall through */
3645d71ae5a4SJacob Faibussowitsch           case 4:
3646d71ae5a4SJacob Faibussowitsch             v4 = a->a + diag[row - 3] + 4; /* fall through */
3647d71ae5a4SJacob Faibussowitsch           case 3:
3648d71ae5a4SJacob Faibussowitsch             v3 = a->a + diag[row - 2] + 3; /* fall through */
3649d71ae5a4SJacob Faibussowitsch           case 2:
3650d71ae5a4SJacob Faibussowitsch             v2 = a->a + diag[row - 1] + 2; /* fall through */
3651d71ae5a4SJacob Faibussowitsch           case 1:
3652d71ae5a4SJacob Faibussowitsch             v1 = a->a + diag[row] + 1;
3653d876e2b0SMark Adams           }
3654d876e2b0SMark Adams         }
3655d876e2b0SMark Adams         /* set sum */
36564d12350bSJunchao Zhang         switch (nodesz) {
3657d71ae5a4SJacob Faibussowitsch         case 5:
3658d71ae5a4SJacob Faibussowitsch           sum5 = xb[row - 4]; /* fall through */
3659d71ae5a4SJacob Faibussowitsch         case 4:
3660d71ae5a4SJacob Faibussowitsch           sum4 = xb[row - 3]; /* fall through */
3661d71ae5a4SJacob Faibussowitsch         case 3:
3662d71ae5a4SJacob Faibussowitsch           sum3 = xb[row - 2]; /* fall through */
3663d71ae5a4SJacob Faibussowitsch         case 2:
3664d71ae5a4SJacob Faibussowitsch           sum2 = xb[row - 1]; /* fall through */
3665d876e2b0SMark Adams         case 1:
3666d876e2b0SMark Adams           /* note that sum1 is associated with the last row */
3667d876e2b0SMark Adams           sum1 = xb[row];
3668d876e2b0SMark Adams         }
3669d876e2b0SMark Adams         /* do sums */
3670d876e2b0SMark Adams         for (n = 0; n < sz - 1; n += 2) {
3671d876e2b0SMark Adams           i1 = idx[0];
3672d876e2b0SMark Adams           i2 = idx[1];
3673d876e2b0SMark Adams           idx += 2;
3674d876e2b0SMark Adams           tmp0 = x[i1];
3675d876e2b0SMark Adams           tmp1 = x[i2];
36764d12350bSJunchao Zhang           switch (nodesz) {
3677d71ae5a4SJacob Faibussowitsch           case 5:
3678d71ae5a4SJacob Faibussowitsch             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3679d71ae5a4SJacob Faibussowitsch             v5 += 2; /* fall through */
3680d71ae5a4SJacob Faibussowitsch           case 4:
3681d71ae5a4SJacob Faibussowitsch             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3682d71ae5a4SJacob Faibussowitsch             v4 += 2; /* fall through */
3683d71ae5a4SJacob Faibussowitsch           case 3:
3684d71ae5a4SJacob Faibussowitsch             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3685d71ae5a4SJacob Faibussowitsch             v3 += 2; /* fall through */
3686d71ae5a4SJacob Faibussowitsch           case 2:
3687d71ae5a4SJacob Faibussowitsch             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3688d71ae5a4SJacob Faibussowitsch             v2 += 2; /* fall through */
3689d71ae5a4SJacob Faibussowitsch           case 1:
3690d71ae5a4SJacob Faibussowitsch             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3691d71ae5a4SJacob Faibussowitsch             v1 += 2;
3692d876e2b0SMark Adams           }
3693d876e2b0SMark Adams         }
3694d876e2b0SMark Adams         /* ragged edge */
3695d876e2b0SMark Adams         if (n == sz - 1) {
3696d876e2b0SMark Adams           tmp0 = x[*idx];
36974d12350bSJunchao Zhang           switch (nodesz) {
3698d71ae5a4SJacob Faibussowitsch           case 5:
3699d71ae5a4SJacob Faibussowitsch             sum5 -= *v5 * tmp0; /* fall through */
3700d71ae5a4SJacob Faibussowitsch           case 4:
3701d71ae5a4SJacob Faibussowitsch             sum4 -= *v4 * tmp0; /* fall through */
3702d71ae5a4SJacob Faibussowitsch           case 3:
3703d71ae5a4SJacob Faibussowitsch             sum3 -= *v3 * tmp0; /* fall through */
3704d71ae5a4SJacob Faibussowitsch           case 2:
3705d71ae5a4SJacob Faibussowitsch             sum2 -= *v2 * tmp0; /* fall through */
3706d71ae5a4SJacob Faibussowitsch           case 1:
3707d71ae5a4SJacob Faibussowitsch             sum1 -= *v1 * tmp0;
3708d876e2b0SMark Adams           }
3709d876e2b0SMark Adams         }
3710d876e2b0SMark Adams         /* update */
3711d876e2b0SMark Adams         if (xb == b) {
3712d876e2b0SMark Adams           /* whole (old way) w/ diag */
37134d12350bSJunchao Zhang           switch (nodesz) {
3714d876e2b0SMark Adams           case 5:
37155850ef23SBarry Smith             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
37165850ef23SBarry Smith             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
37175850ef23SBarry Smith             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
37185850ef23SBarry Smith             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
37195850ef23SBarry Smith             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
37205850ef23SBarry Smith             break;
3721d876e2b0SMark Adams           case 4:
3722d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3723d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3724d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3725d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3726d876e2b0SMark Adams             break;
3727d876e2b0SMark Adams           case 3:
3728d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3729d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3730d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3731d876e2b0SMark Adams             break;
3732d876e2b0SMark Adams           case 2:
3733d876e2b0SMark Adams             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3734d876e2b0SMark Adams             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3735d876e2b0SMark Adams             break;
3736d71ae5a4SJacob Faibussowitsch           case 1:
3737d71ae5a4SJacob Faibussowitsch             x[row--] += sum1 * (*ibdiag);
3738d71ae5a4SJacob Faibussowitsch             break;
3739d876e2b0SMark Adams           }
3740d876e2b0SMark Adams         } else {
3741d876e2b0SMark Adams           /* no diag so set =  */
37424d12350bSJunchao Zhang           switch (nodesz) {
3743d876e2b0SMark Adams           case 5:
3744d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3745d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3746d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3747d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3748d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3749d876e2b0SMark Adams             break;
3750d876e2b0SMark Adams           case 4:
3751d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3752d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3753d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3754d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3755d876e2b0SMark Adams             break;
3756d876e2b0SMark Adams           case 3:
3757d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3758d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3759d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3760d876e2b0SMark Adams             break;
3761d876e2b0SMark Adams           case 2:
3762d876e2b0SMark Adams             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3763d876e2b0SMark Adams             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3764d876e2b0SMark Adams             break;
3765d71ae5a4SJacob Faibussowitsch           case 1:
3766d71ae5a4SJacob Faibussowitsch             x[row--] = sum1 * (*ibdiag);
3767d71ae5a4SJacob Faibussowitsch             break;
37685850ef23SBarry Smith           }
37695850ef23SBarry Smith         }
3770d876e2b0SMark Adams       }
3771d876e2b0SMark Adams       if (xb == b) {
37729566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(2.0 * a->nz));
3773d876e2b0SMark Adams       } else {
37749566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3775d876e2b0SMark Adams       }
37765850ef23SBarry Smith     }
37772af78befSBarry Smith   }
377889c6957cSBarry Smith   if (flag & SOR_EISENSTAT) {
377989c6957cSBarry Smith     /*
378089c6957cSBarry Smith           Apply  (U + D)^-1  where D is now the block diagonal
378189c6957cSBarry Smith     */
378289c6957cSBarry Smith     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
378389c6957cSBarry Smith     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
37844d12350bSJunchao Zhang       nodesz = sizes[i + 1] - sizes[i];
37854d12350bSJunchao Zhang       ibdiag -= nodesz * nodesz;
378689c6957cSBarry Smith       sz  = ii[row + 1] - diag[row] - 1;
378789c6957cSBarry Smith       v1  = a->a + diag[row] + 1;
378889c6957cSBarry Smith       idx = a->j + diag[row] + 1;
37894108e4d5SBarry Smith       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
37904d12350bSJunchao Zhang       switch (nodesz) {
379189c6957cSBarry Smith       case 1:
379289c6957cSBarry Smith 
379389c6957cSBarry Smith         sum1 = b[row];
379489c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
379589c6957cSBarry Smith           i1 = idx[0];
379689c6957cSBarry Smith           i2 = idx[1];
379789c6957cSBarry Smith           idx += 2;
379889c6957cSBarry Smith           tmp0 = x[i1];
379989c6957cSBarry Smith           tmp1 = x[i2];
38009371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38019371c9d4SSatish Balay           v1 += 2;
380289c6957cSBarry Smith         }
380389c6957cSBarry Smith 
380489c6957cSBarry Smith         if (n == sz - 1) {
380589c6957cSBarry Smith           tmp0 = x[*idx];
380689c6957cSBarry Smith           sum1 -= *v1 * tmp0;
380789c6957cSBarry Smith         }
38089371c9d4SSatish Balay         x[row] = sum1 * (*ibdiag);
38099371c9d4SSatish Balay         row--;
381089c6957cSBarry Smith         break;
381189c6957cSBarry Smith 
381289c6957cSBarry Smith       case 2:
381389c6957cSBarry Smith 
381489c6957cSBarry Smith         sum1 = b[row];
381589c6957cSBarry Smith         sum2 = b[row - 1];
381689c6957cSBarry Smith         /* note that sum1 is associated with the second of the two rows */
381789c6957cSBarry Smith         v2 = a->a + diag[row - 1] + 2;
381889c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
381989c6957cSBarry Smith           i1 = idx[0];
382089c6957cSBarry Smith           i2 = idx[1];
382189c6957cSBarry Smith           idx += 2;
382289c6957cSBarry Smith           tmp0 = x[i1];
382389c6957cSBarry Smith           tmp1 = x[i2];
38249371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38259371c9d4SSatish Balay           v1 += 2;
38269371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38279371c9d4SSatish Balay           v2 += 2;
382889c6957cSBarry Smith         }
382989c6957cSBarry Smith 
383089c6957cSBarry Smith         if (n == sz - 1) {
383189c6957cSBarry Smith           tmp0 = x[*idx];
383289c6957cSBarry Smith           sum1 -= *v1 * tmp0;
383389c6957cSBarry Smith           sum2 -= *v2 * tmp0;
383489c6957cSBarry Smith         }
3835938d4eb3SBarry Smith         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3836938d4eb3SBarry Smith         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3837938d4eb3SBarry Smith         row -= 2;
383889c6957cSBarry Smith         break;
383989c6957cSBarry Smith       case 3:
384089c6957cSBarry Smith 
384189c6957cSBarry Smith         sum1 = b[row];
384289c6957cSBarry Smith         sum2 = b[row - 1];
384389c6957cSBarry Smith         sum3 = b[row - 2];
384489c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
384589c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
384689c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
384789c6957cSBarry Smith           i1 = idx[0];
384889c6957cSBarry Smith           i2 = idx[1];
384989c6957cSBarry Smith           idx += 2;
385089c6957cSBarry Smith           tmp0 = x[i1];
385189c6957cSBarry Smith           tmp1 = x[i2];
38529371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38539371c9d4SSatish Balay           v1 += 2;
38549371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38559371c9d4SSatish Balay           v2 += 2;
38569371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
38579371c9d4SSatish Balay           v3 += 2;
385889c6957cSBarry Smith         }
385989c6957cSBarry Smith 
386089c6957cSBarry Smith         if (n == sz - 1) {
386189c6957cSBarry Smith           tmp0 = x[*idx];
386289c6957cSBarry Smith           sum1 -= *v1 * tmp0;
386389c6957cSBarry Smith           sum2 -= *v2 * tmp0;
386489c6957cSBarry Smith           sum3 -= *v3 * tmp0;
386589c6957cSBarry Smith         }
3866938d4eb3SBarry Smith         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3867938d4eb3SBarry Smith         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3868938d4eb3SBarry Smith         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3869938d4eb3SBarry Smith         row -= 3;
387089c6957cSBarry Smith         break;
387189c6957cSBarry Smith       case 4:
387289c6957cSBarry Smith 
387389c6957cSBarry Smith         sum1 = b[row];
387489c6957cSBarry Smith         sum2 = b[row - 1];
387589c6957cSBarry Smith         sum3 = b[row - 2];
387689c6957cSBarry Smith         sum4 = b[row - 3];
387789c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
387889c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
387989c6957cSBarry Smith         v4   = a->a + diag[row - 3] + 4;
388089c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
388189c6957cSBarry Smith           i1 = idx[0];
388289c6957cSBarry Smith           i2 = idx[1];
388389c6957cSBarry Smith           idx += 2;
388489c6957cSBarry Smith           tmp0 = x[i1];
388589c6957cSBarry Smith           tmp1 = x[i2];
38869371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38879371c9d4SSatish Balay           v1 += 2;
38889371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38899371c9d4SSatish Balay           v2 += 2;
38909371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
38919371c9d4SSatish Balay           v3 += 2;
38929371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
38939371c9d4SSatish Balay           v4 += 2;
389489c6957cSBarry Smith         }
389589c6957cSBarry Smith 
389689c6957cSBarry Smith         if (n == sz - 1) {
389789c6957cSBarry Smith           tmp0 = x[*idx];
389889c6957cSBarry Smith           sum1 -= *v1 * tmp0;
389989c6957cSBarry Smith           sum2 -= *v2 * tmp0;
390089c6957cSBarry Smith           sum3 -= *v3 * tmp0;
390189c6957cSBarry Smith           sum4 -= *v4 * tmp0;
390289c6957cSBarry Smith         }
3903938d4eb3SBarry Smith         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3904938d4eb3SBarry Smith         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3905938d4eb3SBarry Smith         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3906938d4eb3SBarry Smith         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3907938d4eb3SBarry Smith         row -= 4;
390889c6957cSBarry Smith         break;
390989c6957cSBarry Smith       case 5:
391089c6957cSBarry Smith 
391189c6957cSBarry Smith         sum1 = b[row];
391289c6957cSBarry Smith         sum2 = b[row - 1];
391389c6957cSBarry Smith         sum3 = b[row - 2];
391489c6957cSBarry Smith         sum4 = b[row - 3];
391589c6957cSBarry Smith         sum5 = b[row - 4];
391689c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
391789c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
391889c6957cSBarry Smith         v4   = a->a + diag[row - 3] + 4;
391989c6957cSBarry Smith         v5   = a->a + diag[row - 4] + 5;
392089c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
392189c6957cSBarry Smith           i1 = idx[0];
392289c6957cSBarry Smith           i2 = idx[1];
392389c6957cSBarry Smith           idx += 2;
392489c6957cSBarry Smith           tmp0 = x[i1];
392589c6957cSBarry Smith           tmp1 = x[i2];
39269371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
39279371c9d4SSatish Balay           v1 += 2;
39289371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
39299371c9d4SSatish Balay           v2 += 2;
39309371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
39319371c9d4SSatish Balay           v3 += 2;
39329371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
39339371c9d4SSatish Balay           v4 += 2;
39349371c9d4SSatish Balay           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
39359371c9d4SSatish Balay           v5 += 2;
393689c6957cSBarry Smith         }
393789c6957cSBarry Smith 
393889c6957cSBarry Smith         if (n == sz - 1) {
393989c6957cSBarry Smith           tmp0 = x[*idx];
394089c6957cSBarry Smith           sum1 -= *v1 * tmp0;
394189c6957cSBarry Smith           sum2 -= *v2 * tmp0;
394289c6957cSBarry Smith           sum3 -= *v3 * tmp0;
394389c6957cSBarry Smith           sum4 -= *v4 * tmp0;
394489c6957cSBarry Smith           sum5 -= *v5 * tmp0;
394589c6957cSBarry Smith         }
3946938d4eb3SBarry Smith         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3947938d4eb3SBarry Smith         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3948938d4eb3SBarry Smith         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3949938d4eb3SBarry Smith         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3950938d4eb3SBarry Smith         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3951938d4eb3SBarry Smith         row -= 5;
395289c6957cSBarry Smith         break;
3953d71ae5a4SJacob Faibussowitsch       default:
39544d12350bSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
395589c6957cSBarry Smith       }
395689c6957cSBarry Smith     }
39579566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
395889c6957cSBarry Smith 
395989c6957cSBarry Smith     /*
396089c6957cSBarry Smith            t = b - D x    where D is the block diagonal
396189c6957cSBarry Smith     */
396289c6957cSBarry Smith     cnt = 0;
396389c6957cSBarry Smith     for (i = 0, row = 0; i < m; i++) {
39644d12350bSJunchao Zhang       nodesz = sizes[i + 1] - sizes[i];
39654d12350bSJunchao Zhang       switch (nodesz) {
396689c6957cSBarry Smith       case 1:
39679371c9d4SSatish Balay         t[row] = b[row] - bdiag[cnt++] * x[row];
39689371c9d4SSatish Balay         row++;
396989c6957cSBarry Smith         break;
397089c6957cSBarry Smith       case 2:
39719371c9d4SSatish Balay         x1         = x[row];
39729371c9d4SSatish Balay         x2         = x[row + 1];
397389c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
397489c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
397589c6957cSBarry Smith         t[row]     = b[row] - tmp1;
39769371c9d4SSatish Balay         t[row + 1] = b[row + 1] - tmp2;
39779371c9d4SSatish Balay         row += 2;
397889c6957cSBarry Smith         cnt += 4;
397989c6957cSBarry Smith         break;
398089c6957cSBarry Smith       case 3:
39819371c9d4SSatish Balay         x1         = x[row];
39829371c9d4SSatish Balay         x2         = x[row + 1];
39839371c9d4SSatish Balay         x3         = x[row + 2];
398489c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
398589c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
398689c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
398789c6957cSBarry Smith         t[row]     = b[row] - tmp1;
398889c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
39899371c9d4SSatish Balay         t[row + 2] = b[row + 2] - tmp3;
39909371c9d4SSatish Balay         row += 3;
399189c6957cSBarry Smith         cnt += 9;
399289c6957cSBarry Smith         break;
399389c6957cSBarry Smith       case 4:
39949371c9d4SSatish Balay         x1         = x[row];
39959371c9d4SSatish Balay         x2         = x[row + 1];
39969371c9d4SSatish Balay         x3         = x[row + 2];
39979371c9d4SSatish Balay         x4         = x[row + 3];
399889c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
399989c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
400089c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
400189c6957cSBarry Smith         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
400289c6957cSBarry Smith         t[row]     = b[row] - tmp1;
400389c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
400489c6957cSBarry Smith         t[row + 2] = b[row + 2] - tmp3;
40059371c9d4SSatish Balay         t[row + 3] = b[row + 3] - tmp4;
40069371c9d4SSatish Balay         row += 4;
400789c6957cSBarry Smith         cnt += 16;
400889c6957cSBarry Smith         break;
400989c6957cSBarry Smith       case 5:
40109371c9d4SSatish Balay         x1         = x[row];
40119371c9d4SSatish Balay         x2         = x[row + 1];
40129371c9d4SSatish Balay         x3         = x[row + 2];
40139371c9d4SSatish Balay         x4         = x[row + 3];
40149371c9d4SSatish Balay         x5         = x[row + 4];
401589c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
401689c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
401789c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
401889c6957cSBarry Smith         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
401989c6957cSBarry Smith         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
402089c6957cSBarry Smith         t[row]     = b[row] - tmp1;
402189c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
402289c6957cSBarry Smith         t[row + 2] = b[row + 2] - tmp3;
402389c6957cSBarry Smith         t[row + 3] = b[row + 3] - tmp4;
40249371c9d4SSatish Balay         t[row + 4] = b[row + 4] - tmp5;
40259371c9d4SSatish Balay         row += 5;
402689c6957cSBarry Smith         cnt += 25;
402789c6957cSBarry Smith         break;
4028d71ae5a4SJacob Faibussowitsch       default:
40294d12350bSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
403089c6957cSBarry Smith       }
403189c6957cSBarry Smith     }
40329566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(m));
403389c6957cSBarry Smith 
403489c6957cSBarry Smith     /*
403589c6957cSBarry Smith           Apply (L + D)^-1 where D is the block diagonal
403689c6957cSBarry Smith     */
403789c6957cSBarry Smith     for (i = 0, row = 0; i < m; i++) {
40384d12350bSJunchao Zhang       nodesz = sizes[i + 1] - sizes[i];
403989c6957cSBarry Smith       sz     = diag[row] - ii[row];
404089c6957cSBarry Smith       v1     = a->a + ii[row];
404189c6957cSBarry Smith       idx    = a->j + ii[row];
40424108e4d5SBarry Smith       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
40434d12350bSJunchao Zhang       switch (nodesz) {
404489c6957cSBarry Smith       case 1:
404589c6957cSBarry Smith 
404689c6957cSBarry Smith         sum1 = t[row];
404789c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
404889c6957cSBarry Smith           i1 = idx[0];
404989c6957cSBarry Smith           i2 = idx[1];
405089c6957cSBarry Smith           idx += 2;
405189c6957cSBarry Smith           tmp0 = t[i1];
405289c6957cSBarry Smith           tmp1 = t[i2];
40539371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
40549371c9d4SSatish Balay           v1 += 2;
405589c6957cSBarry Smith         }
405689c6957cSBarry Smith 
405789c6957cSBarry Smith         if (n == sz - 1) {
405889c6957cSBarry Smith           tmp0 = t[*idx];
405989c6957cSBarry Smith           sum1 -= *v1 * tmp0;
406089c6957cSBarry Smith         }
40619371c9d4SSatish Balay         x[row] += t[row] = sum1 * (*ibdiag++);
40629371c9d4SSatish Balay         row++;
406389c6957cSBarry Smith         break;
406489c6957cSBarry Smith       case 2:
406589c6957cSBarry Smith         v2   = a->a + ii[row + 1];
406689c6957cSBarry Smith         sum1 = t[row];
406789c6957cSBarry Smith         sum2 = t[row + 1];
406889c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
406989c6957cSBarry Smith           i1 = idx[0];
407089c6957cSBarry Smith           i2 = idx[1];
407189c6957cSBarry Smith           idx += 2;
407289c6957cSBarry Smith           tmp0 = t[i1];
407389c6957cSBarry Smith           tmp1 = t[i2];
40749371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
40759371c9d4SSatish Balay           v1 += 2;
40769371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
40779371c9d4SSatish Balay           v2 += 2;
407889c6957cSBarry Smith         }
407989c6957cSBarry Smith 
408089c6957cSBarry Smith         if (n == sz - 1) {
408189c6957cSBarry Smith           tmp0 = t[*idx];
408289c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
408389c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
408489c6957cSBarry Smith         }
408589c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
408689c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
40879371c9d4SSatish Balay         ibdiag += 4;
40889371c9d4SSatish Balay         row += 2;
408989c6957cSBarry Smith         break;
409089c6957cSBarry Smith       case 3:
409189c6957cSBarry Smith         v2   = a->a + ii[row + 1];
409289c6957cSBarry Smith         v3   = a->a + ii[row + 2];
409389c6957cSBarry Smith         sum1 = t[row];
409489c6957cSBarry Smith         sum2 = t[row + 1];
409589c6957cSBarry Smith         sum3 = t[row + 2];
409689c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
409789c6957cSBarry Smith           i1 = idx[0];
409889c6957cSBarry Smith           i2 = idx[1];
409989c6957cSBarry Smith           idx += 2;
410089c6957cSBarry Smith           tmp0 = t[i1];
410189c6957cSBarry Smith           tmp1 = t[i2];
41029371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
41039371c9d4SSatish Balay           v1 += 2;
41049371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
41059371c9d4SSatish Balay           v2 += 2;
41069371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
41079371c9d4SSatish Balay           v3 += 2;
410889c6957cSBarry Smith         }
410989c6957cSBarry Smith 
411089c6957cSBarry Smith         if (n == sz - 1) {
411189c6957cSBarry Smith           tmp0 = t[*idx];
411289c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
411389c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
411489c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
411589c6957cSBarry Smith         }
411689c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
411789c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
411889c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
41199371c9d4SSatish Balay         ibdiag += 9;
41209371c9d4SSatish Balay         row += 3;
412189c6957cSBarry Smith         break;
412289c6957cSBarry Smith       case 4:
412389c6957cSBarry Smith         v2   = a->a + ii[row + 1];
412489c6957cSBarry Smith         v3   = a->a + ii[row + 2];
412589c6957cSBarry Smith         v4   = a->a + ii[row + 3];
412689c6957cSBarry Smith         sum1 = t[row];
412789c6957cSBarry Smith         sum2 = t[row + 1];
412889c6957cSBarry Smith         sum3 = t[row + 2];
412989c6957cSBarry Smith         sum4 = t[row + 3];
413089c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
413189c6957cSBarry Smith           i1 = idx[0];
413289c6957cSBarry Smith           i2 = idx[1];
413389c6957cSBarry Smith           idx += 2;
413489c6957cSBarry Smith           tmp0 = t[i1];
413589c6957cSBarry Smith           tmp1 = t[i2];
41369371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
41379371c9d4SSatish Balay           v1 += 2;
41389371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
41399371c9d4SSatish Balay           v2 += 2;
41409371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
41419371c9d4SSatish Balay           v3 += 2;
41429371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
41439371c9d4SSatish Balay           v4 += 2;
414489c6957cSBarry Smith         }
414589c6957cSBarry Smith 
414689c6957cSBarry Smith         if (n == sz - 1) {
414789c6957cSBarry Smith           tmp0 = t[*idx];
414889c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
414989c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
415089c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
415189c6957cSBarry Smith           sum4 -= v4[0] * tmp0;
415289c6957cSBarry Smith         }
415389c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
415489c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
415589c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
415689c6957cSBarry Smith         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
41579371c9d4SSatish Balay         ibdiag += 16;
41589371c9d4SSatish Balay         row += 4;
415989c6957cSBarry Smith         break;
416089c6957cSBarry Smith       case 5:
416189c6957cSBarry Smith         v2   = a->a + ii[row + 1];
416289c6957cSBarry Smith         v3   = a->a + ii[row + 2];
416389c6957cSBarry Smith         v4   = a->a + ii[row + 3];
416489c6957cSBarry Smith         v5   = a->a + ii[row + 4];
416589c6957cSBarry Smith         sum1 = t[row];
416689c6957cSBarry Smith         sum2 = t[row + 1];
416789c6957cSBarry Smith         sum3 = t[row + 2];
416889c6957cSBarry Smith         sum4 = t[row + 3];
416989c6957cSBarry Smith         sum5 = t[row + 4];
417089c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
417189c6957cSBarry Smith           i1 = idx[0];
417289c6957cSBarry Smith           i2 = idx[1];
417389c6957cSBarry Smith           idx += 2;
417489c6957cSBarry Smith           tmp0 = t[i1];
417589c6957cSBarry Smith           tmp1 = t[i2];
41769371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
41779371c9d4SSatish Balay           v1 += 2;
41789371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
41799371c9d4SSatish Balay           v2 += 2;
41809371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
41819371c9d4SSatish Balay           v3 += 2;
41829371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
41839371c9d4SSatish Balay           v4 += 2;
41849371c9d4SSatish Balay           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
41859371c9d4SSatish Balay           v5 += 2;
418689c6957cSBarry Smith         }
418789c6957cSBarry Smith 
418889c6957cSBarry Smith         if (n == sz - 1) {
418989c6957cSBarry Smith           tmp0 = t[*idx];
419089c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
419189c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
419289c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
419389c6957cSBarry Smith           sum4 -= v4[0] * tmp0;
419489c6957cSBarry Smith           sum5 -= v5[0] * tmp0;
419589c6957cSBarry Smith         }
419689c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
419789c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
419889c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
419989c6957cSBarry Smith         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
420089c6957cSBarry Smith         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
42019371c9d4SSatish Balay         ibdiag += 25;
42029371c9d4SSatish Balay         row += 5;
420389c6957cSBarry Smith         break;
4204d71ae5a4SJacob Faibussowitsch       default:
42054d12350bSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
420689c6957cSBarry Smith       }
420789c6957cSBarry Smith     }
42089566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
42095850ef23SBarry Smith   }
42109566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
42119566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
42123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
42132af78befSBarry Smith }
42142af78befSBarry Smith 
4215ff6a9541SJacob Faibussowitsch static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4216d71ae5a4SJacob Faibussowitsch {
421789c6957cSBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
421889c6957cSBarry Smith   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
421989c6957cSBarry Smith   const MatScalar   *bdiag = a->inode.bdiag;
422089c6957cSBarry Smith   const PetscScalar *b;
42214d12350bSJunchao Zhang   PetscInt           m = a->inode.node_count, cnt = 0, i, row, nodesz;
42224d12350bSJunchao Zhang   const PetscInt    *sizes = a->inode.size_csr;
42232af78befSBarry Smith 
422489c6957cSBarry Smith   PetscFunctionBegin;
42254d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
42269566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
42279566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
422889c6957cSBarry Smith   cnt = 0;
422989c6957cSBarry Smith   for (i = 0, row = 0; i < m; i++) {
42304d12350bSJunchao Zhang     nodesz = sizes[i + 1] - sizes[i];
42314d12350bSJunchao Zhang     switch (nodesz) {
423289c6957cSBarry Smith     case 1:
42339371c9d4SSatish Balay       x[row] = b[row] * bdiag[cnt++];
42349371c9d4SSatish Balay       row++;
423589c6957cSBarry Smith       break;
423689c6957cSBarry Smith     case 2:
42379371c9d4SSatish Balay       x1       = b[row];
42389371c9d4SSatish Balay       x2       = b[row + 1];
423989c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
424089c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
424189c6957cSBarry Smith       x[row++] = tmp1;
424289c6957cSBarry Smith       x[row++] = tmp2;
424389c6957cSBarry Smith       cnt += 4;
424489c6957cSBarry Smith       break;
424589c6957cSBarry Smith     case 3:
42469371c9d4SSatish Balay       x1       = b[row];
42479371c9d4SSatish Balay       x2       = b[row + 1];
42489371c9d4SSatish Balay       x3       = b[row + 2];
424989c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
425089c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
425189c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
425289c6957cSBarry Smith       x[row++] = tmp1;
425389c6957cSBarry Smith       x[row++] = tmp2;
425489c6957cSBarry Smith       x[row++] = tmp3;
425589c6957cSBarry Smith       cnt += 9;
425689c6957cSBarry Smith       break;
425789c6957cSBarry Smith     case 4:
42589371c9d4SSatish Balay       x1       = b[row];
42599371c9d4SSatish Balay       x2       = b[row + 1];
42609371c9d4SSatish Balay       x3       = b[row + 2];
42619371c9d4SSatish Balay       x4       = b[row + 3];
426289c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
426389c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
426489c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
426589c6957cSBarry Smith       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
426689c6957cSBarry Smith       x[row++] = tmp1;
426789c6957cSBarry Smith       x[row++] = tmp2;
426889c6957cSBarry Smith       x[row++] = tmp3;
426989c6957cSBarry Smith       x[row++] = tmp4;
427089c6957cSBarry Smith       cnt += 16;
427189c6957cSBarry Smith       break;
427289c6957cSBarry Smith     case 5:
42739371c9d4SSatish Balay       x1       = b[row];
42749371c9d4SSatish Balay       x2       = b[row + 1];
42759371c9d4SSatish Balay       x3       = b[row + 2];
42769371c9d4SSatish Balay       x4       = b[row + 3];
42779371c9d4SSatish Balay       x5       = b[row + 4];
427889c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
427989c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
428089c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
428189c6957cSBarry Smith       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
428289c6957cSBarry Smith       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
428389c6957cSBarry Smith       x[row++] = tmp1;
428489c6957cSBarry Smith       x[row++] = tmp2;
428589c6957cSBarry Smith       x[row++] = tmp3;
428689c6957cSBarry Smith       x[row++] = tmp4;
428789c6957cSBarry Smith       x[row++] = tmp5;
428889c6957cSBarry Smith       cnt += 25;
428989c6957cSBarry Smith       break;
4290d71ae5a4SJacob Faibussowitsch     default:
42914d12350bSJunchao Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
429289c6957cSBarry Smith     }
429389c6957cSBarry Smith   }
42949566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * cnt));
42959566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
42969566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
42973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
429889c6957cSBarry Smith }
429989c6957cSBarry Smith 
4300d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4301d71ae5a4SJacob Faibussowitsch {
4302b215bc84SStefano Zampini   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4303b215bc84SStefano Zampini 
4304b215bc84SStefano Zampini   PetscFunctionBegin;
4305b215bc84SStefano Zampini   a->inode.node_count       = 0;
4306b215bc84SStefano Zampini   a->inode.use              = PETSC_FALSE;
4307b215bc84SStefano Zampini   a->inode.checked          = PETSC_FALSE;
4308b215bc84SStefano Zampini   a->inode.mat_nonzerostate = -1;
4309b215bc84SStefano Zampini   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4310b215bc84SStefano Zampini   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4311b215bc84SStefano Zampini   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4312b215bc84SStefano Zampini   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4313b215bc84SStefano Zampini   A->ops->coloringpatch     = NULL;
4314b215bc84SStefano Zampini   A->ops->multdiagonalblock = NULL;
4315ad540459SPierre Jolivet   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
43163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4317b215bc84SStefano Zampini }
4318b215bc84SStefano Zampini 
43194c1414c8SBarry Smith /*
43204c1414c8SBarry Smith     samestructure indicates that the matrix has not changed its nonzero structure so we
43214c1414c8SBarry Smith     do not need to recompute the inodes
43224c1414c8SBarry Smith */
4323d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode(Mat A)
4324d71ae5a4SJacob Faibussowitsch {
43254c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
43268758e1faSBarry Smith   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4327ace3abfcSBarry Smith   PetscBool       flag;
43288758e1faSBarry Smith   const PetscInt *idx, *idy, *ii;
43294c1414c8SBarry Smith 
43304c1414c8SBarry Smith   PetscFunctionBegin;
4331b215bc84SStefano Zampini   if (!a->inode.use) {
43329566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
43334d12350bSJunchao Zhang     PetscCall(PetscFree(a->inode.size_csr));
43343ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
4335b215bc84SStefano Zampini   }
43363ba16761SJacob Faibussowitsch   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
43374c1414c8SBarry Smith 
4338d0f46423SBarry Smith   m = A->rmap->n;
43394d12350bSJunchao Zhang   if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
43404d12350bSJunchao Zhang   ns    = a->inode.size_csr;
43414d12350bSJunchao Zhang   ns[0] = 0;
43424c1414c8SBarry Smith 
43434c1414c8SBarry Smith   i          = 0;
43444c1414c8SBarry Smith   node_count = 0;
43454c1414c8SBarry Smith   idx        = a->j;
43464c1414c8SBarry Smith   ii         = a->i;
43476f2c871aSStefano Zampini   if (idx) {
43484c1414c8SBarry Smith     while (i < m) {            /* For each row */
43494c1414c8SBarry Smith       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
43504c1414c8SBarry Smith       /* Limits the number of elements in a node to 'a->inode.limit' */
43514c1414c8SBarry Smith       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
43524c1414c8SBarry Smith         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
43534c1414c8SBarry Smith         if (nzy != nzx) break;
43544c1414c8SBarry Smith         idy += nzx; /* Same nonzero pattern */
43559566063dSJacob Faibussowitsch         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
43564c1414c8SBarry Smith         if (!flag) break;
43574c1414c8SBarry Smith       }
43584d12350bSJunchao Zhang       ns[node_count + 1] = ns[node_count] + blk_size;
43594d12350bSJunchao Zhang       node_count++;
43604c1414c8SBarry Smith       idx += blk_size * nzx;
43614c1414c8SBarry Smith       i = j;
43624c1414c8SBarry Smith     }
43636f2c871aSStefano Zampini   }
43644c1414c8SBarry Smith   /* If not enough inodes found,, do not use inode version of the routines */
43656f2c871aSStefano Zampini   if (!m || !idx || node_count > .8 * m) {
43669566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
43674d12350bSJunchao Zhang     PetscCall(PetscFree(a->inode.size_csr));
43689566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
43694c1414c8SBarry Smith   } else {
4370d5f3da31SBarry Smith     if (!A->factortype) {
4371375a6242SBarry Smith       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4372375a6242SBarry Smith       if (A->rmap->n == A->cmap->n) {
43734108e4d5SBarry Smith         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
43744108e4d5SBarry Smith         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
43754108e4d5SBarry Smith         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
43764108e4d5SBarry Smith         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
43774108e4d5SBarry Smith         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4378375a6242SBarry Smith       }
4379d3ac4fa3SBarry Smith     } else {
4380d3ac4fa3SBarry Smith       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4381d3ac4fa3SBarry Smith     }
43824c1414c8SBarry Smith     a->inode.node_count = node_count;
43839566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
43844c1414c8SBarry Smith   }
4385be6adb11SBarry Smith   a->inode.checked          = PETSC_TRUE;
4386a02bda8eSBarry Smith   a->inode.mat_nonzerostate = A->nonzerostate;
43873ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
43884c1414c8SBarry Smith }
43894c1414c8SBarry Smith 
4390d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4391d71ae5a4SJacob Faibussowitsch {
4392150f0143SBarry Smith   Mat         B = *C;
4393150f0143SBarry Smith   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4394150f0143SBarry Smith   PetscInt    m = A->rmap->n;
4395150f0143SBarry Smith 
4396150f0143SBarry Smith   PetscFunctionBegin;
4397150f0143SBarry Smith   c->inode.use              = a->inode.use;
4398150f0143SBarry Smith   c->inode.limit            = a->inode.limit;
4399150f0143SBarry Smith   c->inode.max_limit        = a->inode.max_limit;
4400ec710b6aSStefano Zampini   c->inode.checked          = PETSC_FALSE;
44014d12350bSJunchao Zhang   c->inode.size_csr         = NULL;
4402ec710b6aSStefano Zampini   c->inode.node_count       = 0;
4403ec710b6aSStefano Zampini   c->inode.ibdiagvalid      = PETSC_FALSE;
4404ec710b6aSStefano Zampini   c->inode.ibdiag           = NULL;
4405ec710b6aSStefano Zampini   c->inode.bdiag            = NULL;
4406ec710b6aSStefano Zampini   c->inode.mat_nonzerostate = -1;
4407b215bc84SStefano Zampini   if (a->inode.use) {
44084d12350bSJunchao Zhang     if (a->inode.checked && a->inode.size_csr) {
44094d12350bSJunchao Zhang       PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
44104d12350bSJunchao Zhang       PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));
4411ec710b6aSStefano Zampini 
4412ec710b6aSStefano Zampini       c->inode.checked          = PETSC_TRUE;
4413ec710b6aSStefano Zampini       c->inode.node_count       = a->inode.node_count;
4414ec710b6aSStefano Zampini       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4415ec710b6aSStefano Zampini     }
4416a02bda8eSBarry Smith     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
44172c451681SBarry Smith     if (!B->factortype) {
44182c451681SBarry Smith       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
44192c451681SBarry Smith       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
44202c451681SBarry Smith       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
44212c451681SBarry Smith       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
44222c451681SBarry Smith       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
44232c451681SBarry Smith       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4424150f0143SBarry Smith     } else {
44252c451681SBarry Smith       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4426150f0143SBarry Smith     }
4427150f0143SBarry Smith   }
44283ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4429150f0143SBarry Smith }
4430150f0143SBarry Smith 
4431d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4432d71ae5a4SJacob Faibussowitsch {
44338758e1faSBarry Smith   PetscInt        k;
44348758e1faSBarry Smith   const PetscInt *vi;
44356e111a19SKarl Rupp 
443617454e89SShri Abhyankar   PetscFunctionBegin;
443717454e89SShri Abhyankar   vi = aj + ai[row];
443817454e89SShri Abhyankar   for (k = 0; k < nzl; k++) cols[k] = vi[k];
443917454e89SShri Abhyankar   vi        = aj + adiag[row];
444017454e89SShri Abhyankar   cols[nzl] = vi[0];
444117454e89SShri Abhyankar   vi        = aj + adiag[row + 1] + 1;
444217454e89SShri Abhyankar   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
44433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
444417454e89SShri Abhyankar }
44456936b636SHong Zhang /*
4446a02bda8eSBarry Smith    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4447a02bda8eSBarry Smith    Modified from MatSeqAIJCheckInode().
44486936b636SHong Zhang 
44496936b636SHong Zhang    Input Parameters:
4450abb87a52SBarry Smith .  Mat A - ILU or LU matrix factor
4451abb87a52SBarry Smith 
44526936b636SHong Zhang */
4453d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4454d71ae5a4SJacob Faibussowitsch {
4455019b515eSShri Abhyankar   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4456019b515eSShri Abhyankar   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
44578758e1faSBarry Smith   PetscInt       *cols1, *cols2, *ns;
44588758e1faSBarry Smith   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4459ace3abfcSBarry Smith   PetscBool       flag;
4460019b515eSShri Abhyankar 
4461019b515eSShri Abhyankar   PetscFunctionBegin;
44623ba16761SJacob Faibussowitsch   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
44633ba16761SJacob Faibussowitsch   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4464019b515eSShri Abhyankar 
4465019b515eSShri Abhyankar   m = A->rmap->n;
44664d12350bSJunchao Zhang   if (a->inode.size_csr) ns = a->inode.size_csr;
446748a46eb9SPierre Jolivet   else PetscCall(PetscMalloc1(m + 1, &ns));
44684d12350bSJunchao Zhang   ns[0] = 0;
4469019b515eSShri Abhyankar 
4470019b515eSShri Abhyankar   i          = 0;
4471019b515eSShri Abhyankar   node_count = 0;
44729566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4473019b515eSShri Abhyankar   while (i < m) {                       /* For each row */
4474019b515eSShri Abhyankar     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4475019b515eSShri Abhyankar     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4476019b515eSShri Abhyankar     nzx  = nzl1 + nzu1 + 1;
44773ba16761SJacob Faibussowitsch     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4478019b515eSShri Abhyankar 
4479019b515eSShri Abhyankar     /* Limits the number of elements in a node to 'a->inode.limit' */
4480019b515eSShri Abhyankar     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4481019b515eSShri Abhyankar       nzl2 = ai[j + 1] - ai[j];
4482019b515eSShri Abhyankar       nzu2 = adiag[j] - adiag[j + 1] - 1;
4483019b515eSShri Abhyankar       nzy  = nzl2 + nzu2 + 1;
4484019b515eSShri Abhyankar       if (nzy != nzx) break;
44859566063dSJacob Faibussowitsch       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
44869566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
44878758e1faSBarry Smith       if (!flag) break;
4488019b515eSShri Abhyankar     }
44894d12350bSJunchao Zhang     ns[node_count + 1] = ns[node_count] + blk_size;
44904d12350bSJunchao Zhang     node_count++;
4491019b515eSShri Abhyankar     i = j;
4492019b515eSShri Abhyankar   }
44939566063dSJacob Faibussowitsch   PetscCall(PetscFree2(cols1, cols2));
4494019b515eSShri Abhyankar   /* If not enough inodes found,, do not use inode version of the routines */
4495be6adb11SBarry Smith   if (!m || node_count > .8 * m) {
44969566063dSJacob Faibussowitsch     PetscCall(PetscFree(ns));
44972205254eSKarl Rupp 
4498019b515eSShri Abhyankar     a->inode.node_count = 0;
44994d12350bSJunchao Zhang     a->inode.size_csr   = NULL;
4500019b515eSShri Abhyankar     a->inode.use        = PETSC_FALSE;
45012205254eSKarl Rupp 
45029566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4503019b515eSShri Abhyankar   } else {
4504f4259b30SLisandro Dalcin     A->ops->mult              = NULL;
4505f4259b30SLisandro Dalcin     A->ops->sor               = NULL;
4506f4259b30SLisandro Dalcin     A->ops->multadd           = NULL;
4507f4259b30SLisandro Dalcin     A->ops->getrowij          = NULL;
4508f4259b30SLisandro Dalcin     A->ops->restorerowij      = NULL;
4509f4259b30SLisandro Dalcin     A->ops->getcolumnij       = NULL;
4510f4259b30SLisandro Dalcin     A->ops->restorecolumnij   = NULL;
4511f4259b30SLisandro Dalcin     A->ops->coloringpatch     = NULL;
4512f4259b30SLisandro Dalcin     A->ops->multdiagonalblock = NULL;
4513019b515eSShri Abhyankar     a->inode.node_count       = node_count;
45144d12350bSJunchao Zhang     a->inode.size_csr         = ns;
45159566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4516019b515eSShri Abhyankar   }
4517be6adb11SBarry Smith   a->inode.checked = PETSC_TRUE;
45183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4519019b515eSShri Abhyankar }
4520019b515eSShri Abhyankar 
4521d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4522d71ae5a4SJacob Faibussowitsch {
4523acf2f550SJed Brown   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4524acf2f550SJed Brown 
4525acf2f550SJed Brown   PetscFunctionBegin;
4526acf2f550SJed Brown   a->inode.ibdiagvalid = PETSC_FALSE;
45273ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4528acf2f550SJed Brown }
4529acf2f550SJed Brown 
45304c1414c8SBarry Smith /*
45314c1414c8SBarry Smith      This is really ugly. if inodes are used this replaces the
45324c1414c8SBarry Smith   permutations with ones that correspond to rows/cols of the matrix
4533467446fbSPierre Jolivet   rather than inode blocks
45344c1414c8SBarry Smith */
4535d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4536d71ae5a4SJacob Faibussowitsch {
45374c1414c8SBarry Smith   PetscFunctionBegin;
4538cac4c232SBarry Smith   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
45393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
45404c1414c8SBarry Smith }
45414c1414c8SBarry Smith 
4542d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4543d71ae5a4SJacob Faibussowitsch {
45444c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
45455d0c19d7SBarry Smith   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
45465d0c19d7SBarry Smith   const PetscInt *ridx, *cidx;
45474d12350bSJunchao Zhang   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
45484c1414c8SBarry Smith   PetscInt        nslim_col, *ns_col;
45494c1414c8SBarry Smith   IS              ris = *rperm, cis = *cperm;
45504c1414c8SBarry Smith 
45514c1414c8SBarry Smith   PetscFunctionBegin;
45524d12350bSJunchao Zhang   if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS);       /* no inodes so return */
45533ba16761SJacob Faibussowitsch   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
45544c1414c8SBarry Smith 
45559566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
455632603206SJames Wright   PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
45579566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(m, &permr, n, &permc));
45584c1414c8SBarry Smith 
45599566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(ris, &ridx));
45609566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(cis, &cidx));
45614c1414c8SBarry Smith 
4562baca6076SPierre Jolivet   /* Form the inode structure for the rows of permuted matrix using inv perm*/
45634d12350bSJunchao Zhang   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);
45644c1414c8SBarry Smith 
45654c1414c8SBarry Smith   /* Construct the permutations for rows*/
45664c1414c8SBarry Smith   for (i = 0, row = 0; i < nslim_row; ++i) {
45674c1414c8SBarry Smith     indx      = ridx[i];
45684c1414c8SBarry Smith     start_val = tns[indx];
45694c1414c8SBarry Smith     end_val   = tns[indx + 1];
45704c1414c8SBarry Smith     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
45714c1414c8SBarry Smith   }
45724c1414c8SBarry Smith 
45734c1414c8SBarry Smith   /* Form the inode structure for the columns of permuted matrix using inv perm*/
45744d12350bSJunchao Zhang   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);
45754c1414c8SBarry Smith 
45764c1414c8SBarry Smith   /* Construct permutations for columns */
45774c1414c8SBarry Smith   for (i = 0, col = 0; i < nslim_col; ++i) {
45784c1414c8SBarry Smith     indx      = cidx[i];
45794c1414c8SBarry Smith     start_val = tns[indx];
45804c1414c8SBarry Smith     end_val   = tns[indx + 1];
45814c1414c8SBarry Smith     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
45824c1414c8SBarry Smith   }
45834c1414c8SBarry Smith 
45849566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
45859566063dSJacob Faibussowitsch   PetscCall(ISSetPermutation(*rperm));
45869566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
45879566063dSJacob Faibussowitsch   PetscCall(ISSetPermutation(*cperm));
45884c1414c8SBarry Smith 
45899566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(ris, &ridx));
45909566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(cis, &cidx));
45914c1414c8SBarry Smith 
45929566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
45939566063dSJacob Faibussowitsch   PetscCall(PetscFree2(permr, permc));
45949566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&cis));
45959566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&ris));
45969566063dSJacob Faibussowitsch   PetscCall(PetscFree(tns));
45973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
45984c1414c8SBarry Smith }
45994c1414c8SBarry Smith 
46004c1414c8SBarry Smith /*@C
460111a5261eSBarry Smith   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
46024c1414c8SBarry Smith 
46033f9fe445SBarry Smith   Not Collective
46044c1414c8SBarry Smith 
46054c1414c8SBarry Smith   Input Parameter:
460611a5261eSBarry Smith . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
46074c1414c8SBarry Smith 
4608d8d19677SJose E. Roman   Output Parameters:
46094c1414c8SBarry Smith + node_count - no of inodes present in the matrix.
46102ef1f0ffSBarry Smith . sizes      - an array of size `node_count`, with the sizes of each inode.
46114c1414c8SBarry Smith - limit      - the max size used to generate the inodes.
46124c1414c8SBarry Smith 
46134c1414c8SBarry Smith   Level: advanced
46144c1414c8SBarry Smith 
461511a5261eSBarry Smith   Note:
46164c1414c8SBarry Smith   It should be called after the matrix is assembled.
46174c1414c8SBarry Smith   The contents of the sizes[] array should not be changed.
46182ef1f0ffSBarry Smith   `NULL` may be passed for information not needed
46194c1414c8SBarry Smith 
46201cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
46214c1414c8SBarry Smith @*/
4622d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4623d71ae5a4SJacob Faibussowitsch {
46245f80ce2aSJacob Faibussowitsch   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
46254c1414c8SBarry Smith 
46264c1414c8SBarry Smith   PetscFunctionBegin;
46275f80ce2aSJacob Faibussowitsch   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
46289566063dSJacob Faibussowitsch   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
46299566063dSJacob Faibussowitsch   if (f) PetscCall((*f)(A, node_count, sizes, limit));
46303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
46314c1414c8SBarry Smith }
46324c1414c8SBarry Smith 
4633d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4634d71ae5a4SJacob Faibussowitsch {
46354c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
46364c1414c8SBarry Smith 
46374c1414c8SBarry Smith   PetscFunctionBegin;
46384c1414c8SBarry Smith   if (node_count) *node_count = a->inode.node_count;
46394d12350bSJunchao Zhang   if (sizes) *sizes = a->inode.size_csr;
46404c1414c8SBarry Smith   if (limit) *limit = a->inode.limit;
46413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
46424c1414c8SBarry Smith }
4643